xref: /freebsd/contrib/llvm-project/clang/lib/Format/FormatTokenLexer.cpp (revision 0d8fe2373503aeac48492f28073049a8bfa4feb5)
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
25 FormatTokenLexer::FormatTokenLexer(
26     const SourceManager &SourceMgr, FileID ID, unsigned Column,
27     const FormatStyle &Style, encoding::Encoding Encoding,
28     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29     IdentifierTable &IdentTable)
30     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31       Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
32       Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
33       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
34       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
35       MacroBlockEndRegex(Style.MacroBlockEnd) {
36   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
37                       getFormattingLangOpts(Style)));
38   Lex->SetKeepWhitespaceMode(true);
39 
40   for (const std::string &ForEachMacro : Style.ForEachMacros)
41     Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
42   for (const std::string &AttributeMacro : Style.AttributeMacros)
43     Macros.insert({&IdentTable.get(AttributeMacro), TT_AttributeMacro});
44   for (const std::string &StatementMacro : Style.StatementMacros)
45     Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
46   for (const std::string &TypenameMacro : Style.TypenameMacros)
47     Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
48   for (const std::string &NamespaceMacro : Style.NamespaceMacros)
49     Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
50   for (const std::string &WhitespaceSensitiveMacro :
51        Style.WhitespaceSensitiveMacros) {
52     Macros.insert(
53         {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc});
54   }
55   for (const std::string &StatementAttributeLikeMacro :
56        Style.StatementAttributeLikeMacros)
57     Macros.insert({&IdentTable.get(StatementAttributeLikeMacro),
58                    TT_StatementAttributeLikeMacro});
59 }
60 
61 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
62   assert(Tokens.empty());
63   assert(FirstInLineIndex == 0);
64   do {
65     Tokens.push_back(getNextToken());
66     if (Style.Language == FormatStyle::LK_JavaScript) {
67       tryParseJSRegexLiteral();
68       handleTemplateStrings();
69     }
70     if (Style.Language == FormatStyle::LK_TextProto)
71       tryParsePythonComment();
72     tryMergePreviousTokens();
73     if (Style.isCSharp())
74       // This needs to come after tokens have been merged so that C#
75       // string literals are correctly identified.
76       handleCSharpVerbatimAndInterpolatedStrings();
77     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
78       FirstInLineIndex = Tokens.size() - 1;
79   } while (Tokens.back()->Tok.isNot(tok::eof));
80   return Tokens;
81 }
82 
83 void FormatTokenLexer::tryMergePreviousTokens() {
84   if (tryMerge_TMacro())
85     return;
86   if (tryMergeConflictMarkers())
87     return;
88   if (tryMergeLessLess())
89     return;
90   if (tryMergeForEach())
91     return;
92   if (Style.isCpp() && tryTransformTryUsageForC())
93     return;
94 
95   if (Style.isCSharp()) {
96     if (tryMergeCSharpKeywordVariables())
97       return;
98     if (tryMergeCSharpStringLiteral())
99       return;
100     if (tryMergeCSharpDoubleQuestion())
101       return;
102     if (tryMergeCSharpNullConditional())
103       return;
104     if (tryTransformCSharpForEach())
105       return;
106     static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
107     if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
108       return;
109   }
110 
111   if (tryMergeNSStringLiteral())
112     return;
113 
114   if (Style.Language == FormatStyle::LK_JavaScript) {
115     static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
116     static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
117                                                    tok::equal};
118     static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
119                                                   tok::greaterequal};
120     static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
121     static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
122     static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
123                                                            tok::starequal};
124     static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
125                                                                tok::period};
126     static const tok::TokenKind JSNullishOperator[] = {tok::question,
127                                                        tok::question};
128     static const tok::TokenKind JSNullishEqual[] = {tok::question,
129                                                     tok::question, tok::equal};
130     static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
131     static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
132 
133     // FIXME: Investigate what token type gives the correct operator priority.
134     if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
135       return;
136     if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
137       return;
138     if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
139       return;
140     if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
141       return;
142     if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
143       return;
144     if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
145       Tokens.back()->Tok.setKind(tok::starequal);
146       return;
147     }
148     if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
149       // Treat like the "||" operator (as opposed to the ternary ?).
150       Tokens.back()->Tok.setKind(tok::pipepipe);
151       return;
152     }
153     if (tryMergeTokens(JSNullPropagatingOperator,
154                        TT_JsNullPropagatingOperator)) {
155       // Treat like a regular "." access.
156       Tokens.back()->Tok.setKind(tok::period);
157       return;
158     }
159     if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
160         tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual) ||
161         tryMergeTokens(JSNullishEqual, TT_JsNullishCoalescingEqual)) {
162       // Treat like the "=" assignment operator.
163       Tokens.back()->Tok.setKind(tok::equal);
164       return;
165     }
166     if (tryMergeJSPrivateIdentifier())
167       return;
168   }
169 
170   if (Style.Language == FormatStyle::LK_Java) {
171     static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
172         tok::greater, tok::greater, tok::greaterequal};
173     if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
174       return;
175   }
176 }
177 
178 bool FormatTokenLexer::tryMergeNSStringLiteral() {
179   if (Tokens.size() < 2)
180     return false;
181   auto &At = *(Tokens.end() - 2);
182   auto &String = *(Tokens.end() - 1);
183   if (!At->is(tok::at) || !String->is(tok::string_literal))
184     return false;
185   At->Tok.setKind(tok::string_literal);
186   At->TokenText = StringRef(At->TokenText.begin(),
187                             String->TokenText.end() - At->TokenText.begin());
188   At->ColumnWidth += String->ColumnWidth;
189   At->setType(TT_ObjCStringLiteral);
190   Tokens.erase(Tokens.end() - 1);
191   return true;
192 }
193 
194 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
195   // Merges #idenfier into a single identifier with the text #identifier
196   // but the token tok::identifier.
197   if (Tokens.size() < 2)
198     return false;
199   auto &Hash = *(Tokens.end() - 2);
200   auto &Identifier = *(Tokens.end() - 1);
201   if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
202     return false;
203   Hash->Tok.setKind(tok::identifier);
204   Hash->TokenText =
205       StringRef(Hash->TokenText.begin(),
206                 Identifier->TokenText.end() - Hash->TokenText.begin());
207   Hash->ColumnWidth += Identifier->ColumnWidth;
208   Hash->setType(TT_JsPrivateIdentifier);
209   Tokens.erase(Tokens.end() - 1);
210   return true;
211 }
212 
213 // Search for verbatim or interpolated string literals @"ABC" or
214 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
215 // prevent splitting of @, $ and ".
216 // Merging of multiline verbatim strings with embedded '"' is handled in
217 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
218 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
219   if (Tokens.size() < 2)
220     return false;
221 
222   // Interpolated strings could contain { } with " characters inside.
223   // $"{x ?? "null"}"
224   // should not be split into $"{x ?? ", null, "}" but should treated as a
225   // single string-literal.
226   //
227   // We opt not to try and format expressions inside {} within a C#
228   // interpolated string. Formatting expressions within an interpolated string
229   // would require similar work as that done for JavaScript template strings
230   // in `handleTemplateStrings()`.
231   auto &CSharpInterpolatedString = *(Tokens.end() - 2);
232   if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
233       (CSharpInterpolatedString->TokenText.startswith(R"($")") ||
234        CSharpInterpolatedString->TokenText.startswith(R"($@")"))) {
235     int UnmatchedOpeningBraceCount = 0;
236 
237     auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
238     for (size_t Index = 0; Index < TokenTextSize; ++Index) {
239       char C = CSharpInterpolatedString->TokenText[Index];
240       if (C == '{') {
241         // "{{"  inside an interpolated string is an escaped '{' so skip it.
242         if (Index + 1 < TokenTextSize &&
243             CSharpInterpolatedString->TokenText[Index + 1] == '{') {
244           ++Index;
245           continue;
246         }
247         ++UnmatchedOpeningBraceCount;
248       } else if (C == '}') {
249         // "}}"  inside an interpolated string is an escaped '}' so skip it.
250         if (Index + 1 < TokenTextSize &&
251             CSharpInterpolatedString->TokenText[Index + 1] == '}') {
252           ++Index;
253           continue;
254         }
255         --UnmatchedOpeningBraceCount;
256       }
257     }
258 
259     if (UnmatchedOpeningBraceCount > 0) {
260       auto &NextToken = *(Tokens.end() - 1);
261       CSharpInterpolatedString->TokenText =
262           StringRef(CSharpInterpolatedString->TokenText.begin(),
263                     NextToken->TokenText.end() -
264                         CSharpInterpolatedString->TokenText.begin());
265       CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
266       Tokens.erase(Tokens.end() - 1);
267       return true;
268     }
269   }
270 
271   // Look for @"aaaaaa" or $"aaaaaa".
272   auto &String = *(Tokens.end() - 1);
273   if (!String->is(tok::string_literal))
274     return false;
275 
276   auto &At = *(Tokens.end() - 2);
277   if (!(At->is(tok::at) || At->TokenText == "$"))
278     return false;
279 
280   if (Tokens.size() > 2 && At->is(tok::at)) {
281     auto &Dollar = *(Tokens.end() - 3);
282     if (Dollar->TokenText == "$") {
283       // This looks like $@"aaaaa" so we need to combine all 3 tokens.
284       Dollar->Tok.setKind(tok::string_literal);
285       Dollar->TokenText =
286           StringRef(Dollar->TokenText.begin(),
287                     String->TokenText.end() - Dollar->TokenText.begin());
288       Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
289       Dollar->setType(TT_CSharpStringLiteral);
290       Tokens.erase(Tokens.end() - 2);
291       Tokens.erase(Tokens.end() - 1);
292       return true;
293     }
294   }
295 
296   // Convert back into just a string_literal.
297   At->Tok.setKind(tok::string_literal);
298   At->TokenText = StringRef(At->TokenText.begin(),
299                             String->TokenText.end() - At->TokenText.begin());
300   At->ColumnWidth += String->ColumnWidth;
301   At->setType(TT_CSharpStringLiteral);
302   Tokens.erase(Tokens.end() - 1);
303   return true;
304 }
305 
// Valid C# attribute targets:
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
// Stored as a string set so that a candidate name can be checked by lookup.
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
    "assembly", "module",   "field",  "event", "method",
    "param",    "property", "return", "type",
};
312 
313 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
314   if (Tokens.size() < 2)
315     return false;
316   auto &FirstQuestion = *(Tokens.end() - 2);
317   auto &SecondQuestion = *(Tokens.end() - 1);
318   if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
319     return false;
320   FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens.
321   FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
322                                        SecondQuestion->TokenText.end() -
323                                            FirstQuestion->TokenText.begin());
324   FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
325   FirstQuestion->setType(TT_CSharpNullCoalescing);
326   Tokens.erase(Tokens.end() - 1);
327   return true;
328 }
329 
330 // Merge '?[' and '?.' pairs into single tokens.
331 bool FormatTokenLexer::tryMergeCSharpNullConditional() {
332   if (Tokens.size() < 2)
333     return false;
334   auto &Question = *(Tokens.end() - 2);
335   auto &PeriodOrLSquare = *(Tokens.end() - 1);
336   if (!Question->is(tok::question) ||
337       !PeriodOrLSquare->isOneOf(tok::l_square, tok::period))
338     return false;
339   Question->TokenText =
340       StringRef(Question->TokenText.begin(),
341                 PeriodOrLSquare->TokenText.end() - Question->TokenText.begin());
342   Question->ColumnWidth += PeriodOrLSquare->ColumnWidth;
343 
344   if (PeriodOrLSquare->is(tok::l_square)) {
345     Question->Tok.setKind(tok::question); // no '?[' in clang tokens.
346     Question->setType(TT_CSharpNullConditionalLSquare);
347   } else {
348     Question->Tok.setKind(tok::question); // no '?.' in clang tokens.
349     Question->setType(TT_CSharpNullConditional);
350   }
351 
352   Tokens.erase(Tokens.end() - 1);
353   return true;
354 }
355 
356 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
357   if (Tokens.size() < 2)
358     return false;
359   auto &At = *(Tokens.end() - 2);
360   auto &Keyword = *(Tokens.end() - 1);
361   if (!At->is(tok::at))
362     return false;
363   if (!Keywords.isCSharpKeyword(*Keyword))
364     return false;
365 
366   At->Tok.setKind(tok::identifier);
367   At->TokenText = StringRef(At->TokenText.begin(),
368                             Keyword->TokenText.end() - At->TokenText.begin());
369   At->ColumnWidth += Keyword->ColumnWidth;
370   At->setType(Keyword->getType());
371   Tokens.erase(Tokens.end() - 1);
372   return true;
373 }
374 
375 // In C# transform identifier foreach into kw_foreach
376 bool FormatTokenLexer::tryTransformCSharpForEach() {
377   if (Tokens.size() < 1)
378     return false;
379   auto &Identifier = *(Tokens.end() - 1);
380   if (!Identifier->is(tok::identifier))
381     return false;
382   if (Identifier->TokenText != "foreach")
383     return false;
384 
385   Identifier->setType(TT_ForEachMacro);
386   Identifier->Tok.setKind(tok::kw_for);
387   return true;
388 }
389 
390 bool FormatTokenLexer::tryMergeForEach() {
391   if (Tokens.size() < 2)
392     return false;
393   auto &For = *(Tokens.end() - 2);
394   auto &Each = *(Tokens.end() - 1);
395   if (!For->is(tok::kw_for))
396     return false;
397   if (!Each->is(tok::identifier))
398     return false;
399   if (Each->TokenText != "each")
400     return false;
401 
402   For->setType(TT_ForEachMacro);
403   For->Tok.setKind(tok::kw_for);
404 
405   For->TokenText = StringRef(For->TokenText.begin(),
406                              Each->TokenText.end() - For->TokenText.begin());
407   For->ColumnWidth += Each->ColumnWidth;
408   Tokens.erase(Tokens.end() - 1);
409   return true;
410 }
411 
412 bool FormatTokenLexer::tryTransformTryUsageForC() {
413   if (Tokens.size() < 2)
414     return false;
415   auto &Try = *(Tokens.end() - 2);
416   if (!Try->is(tok::kw_try))
417     return false;
418   auto &Next = *(Tokens.end() - 1);
419   if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
420     return false;
421 
422   if (Tokens.size() > 2) {
423     auto &At = *(Tokens.end() - 3);
424     if (At->is(tok::at))
425       return false;
426   }
427 
428   Try->Tok.setKind(tok::identifier);
429   return true;
430 }
431 
432 bool FormatTokenLexer::tryMergeLessLess() {
433   // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
434   if (Tokens.size() < 3)
435     return false;
436 
437   bool FourthTokenIsLess = false;
438   if (Tokens.size() > 3)
439     FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
440 
441   auto First = Tokens.end() - 3;
442   if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
443       First[0]->isNot(tok::less) || FourthTokenIsLess)
444     return false;
445 
446   // Only merge if there currently is no whitespace between the two "<".
447   if (First[1]->WhitespaceRange.getBegin() !=
448       First[1]->WhitespaceRange.getEnd())
449     return false;
450 
451   First[0]->Tok.setKind(tok::lessless);
452   First[0]->TokenText = "<<";
453   First[0]->ColumnWidth += 1;
454   Tokens.erase(Tokens.end() - 2);
455   return true;
456 }
457 
458 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
459                                       TokenType NewType) {
460   if (Tokens.size() < Kinds.size())
461     return false;
462 
463   SmallVectorImpl<FormatToken *>::const_iterator First =
464       Tokens.end() - Kinds.size();
465   if (!First[0]->is(Kinds[0]))
466     return false;
467   unsigned AddLength = 0;
468   for (unsigned i = 1; i < Kinds.size(); ++i) {
469     if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
470                                        First[i]->WhitespaceRange.getEnd())
471       return false;
472     AddLength += First[i]->TokenText.size();
473   }
474   Tokens.resize(Tokens.size() - Kinds.size() + 1);
475   First[0]->TokenText = StringRef(First[0]->TokenText.data(),
476                                   First[0]->TokenText.size() + AddLength);
477   First[0]->ColumnWidth += AddLength;
478   First[0]->setType(NewType);
479   return true;
480 }
481 
482 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
483 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
484   // NB: This is not entirely correct, as an r_paren can introduce an operand
485   // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
486   // corner case to not matter in practice, though.
487   return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
488                       tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
489                       tok::colon, tok::question, tok::tilde) ||
490          Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
491                       tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
492                       tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
493          Tok->isBinaryOperator();
494 }
495 
496 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
497   if (!Prev)
498     return true;
499 
500   // Regex literals can only follow after prefix unary operators, not after
501   // postfix unary operators. If the '++' is followed by a non-operand
502   // introducing token, the slash here is the operand and not the start of a
503   // regex.
504   // `!` is an unary prefix operator, but also a post-fix operator that casts
505   // away nullability, so the same check applies.
506   if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
507     return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
508 
509   // The previous token must introduce an operand location where regex
510   // literals can occur.
511   if (!precedesOperand(Prev))
512     return false;
513 
514   return true;
515 }
516 
517 // Tries to parse a JavaScript Regex literal starting at the current token,
518 // if that begins with a slash and is in a location where JavaScript allows
519 // regex literals. Changes the current token to a regex literal and updates
520 // its text if successful.
521 void FormatTokenLexer::tryParseJSRegexLiteral() {
522   FormatToken *RegexToken = Tokens.back();
523   if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
524     return;
525 
526   FormatToken *Prev = nullptr;
527   for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
528     // NB: Because previous pointers are not initialized yet, this cannot use
529     // Token.getPreviousNonComment.
530     if ((*I)->isNot(tok::comment)) {
531       Prev = *I;
532       break;
533     }
534   }
535 
536   if (!canPrecedeRegexLiteral(Prev))
537     return;
538 
539   // 'Manually' lex ahead in the current file buffer.
540   const char *Offset = Lex->getBufferLocation();
541   const char *RegexBegin = Offset - RegexToken->TokenText.size();
542   StringRef Buffer = Lex->getBuffer();
543   bool InCharacterClass = false;
544   bool HaveClosingSlash = false;
545   for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
546     // Regular expressions are terminated with a '/', which can only be
547     // escaped using '\' or a character class between '[' and ']'.
548     // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
549     switch (*Offset) {
550     case '\\':
551       // Skip the escaped character.
552       ++Offset;
553       break;
554     case '[':
555       InCharacterClass = true;
556       break;
557     case ']':
558       InCharacterClass = false;
559       break;
560     case '/':
561       if (!InCharacterClass)
562         HaveClosingSlash = true;
563       break;
564     }
565   }
566 
567   RegexToken->setType(TT_RegexLiteral);
568   // Treat regex literals like other string_literals.
569   RegexToken->Tok.setKind(tok::string_literal);
570   RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
571   RegexToken->ColumnWidth = RegexToken->TokenText.size();
572 
573   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
574 }
575 
576 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
577   FormatToken *CSharpStringLiteral = Tokens.back();
578 
579   if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
580     return;
581 
582   // Deal with multiline strings.
583   if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
584         CSharpStringLiteral->TokenText.startswith(R"($@")")))
585     return;
586 
587   const char *StrBegin =
588       Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
589   const char *Offset = StrBegin;
590   if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
591     Offset += 2;
592   else // CSharpStringLiteral->TokenText.startswith(R"($@")")
593     Offset += 3;
594 
595   // Look for a terminating '"' in the current file buffer.
596   // Make no effort to format code within an interpolated or verbatim string.
597   for (; Offset != Lex->getBuffer().end(); ++Offset) {
598     if (Offset[0] == '"') {
599       // "" within a verbatim string is an escaped double quote: skip it.
600       if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
601         ++Offset;
602       else
603         break;
604     }
605   }
606 
607   // Make no attempt to format code properly if a verbatim string is
608   // unterminated.
609   if (Offset == Lex->getBuffer().end())
610     return;
611 
612   StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
613   CSharpStringLiteral->TokenText = LiteralText;
614 
615   // Adjust width for potentially multiline string literals.
616   size_t FirstBreak = LiteralText.find('\n');
617   StringRef FirstLineText = FirstBreak == StringRef::npos
618                                 ? LiteralText
619                                 : LiteralText.substr(0, FirstBreak);
620   CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
621       FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
622       Encoding);
623   size_t LastBreak = LiteralText.rfind('\n');
624   if (LastBreak != StringRef::npos) {
625     CSharpStringLiteral->IsMultiline = true;
626     unsigned StartColumn = 0;
627     CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
628         LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
629         Style.TabWidth, Encoding);
630   }
631 
632   SourceLocation loc = Offset < Lex->getBuffer().end()
633                            ? Lex->getSourceLocation(Offset + 1)
634                            : SourceMgr.getLocForEndOfFile(ID);
635   resetLexer(SourceMgr.getFileOffset(loc));
636 }
637 
638 void FormatTokenLexer::handleTemplateStrings() {
639   FormatToken *BacktickToken = Tokens.back();
640 
641   if (BacktickToken->is(tok::l_brace)) {
642     StateStack.push(LexerState::NORMAL);
643     return;
644   }
645   if (BacktickToken->is(tok::r_brace)) {
646     if (StateStack.size() == 1)
647       return;
648     StateStack.pop();
649     if (StateStack.top() != LexerState::TEMPLATE_STRING)
650       return;
651     // If back in TEMPLATE_STRING, fallthrough and continue parsing the
652   } else if (BacktickToken->is(tok::unknown) &&
653              BacktickToken->TokenText == "`") {
654     StateStack.push(LexerState::TEMPLATE_STRING);
655   } else {
656     return; // Not actually a template
657   }
658 
659   // 'Manually' lex ahead in the current file buffer.
660   const char *Offset = Lex->getBufferLocation();
661   const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
662   for (; Offset != Lex->getBuffer().end(); ++Offset) {
663     if (Offset[0] == '`') {
664       StateStack.pop();
665       break;
666     }
667     if (Offset[0] == '\\') {
668       ++Offset; // Skip the escaped character.
669     } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
670                Offset[1] == '{') {
671       // '${' introduces an expression interpolation in the template string.
672       StateStack.push(LexerState::NORMAL);
673       ++Offset;
674       break;
675     }
676   }
677 
678   StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
679   BacktickToken->setType(TT_TemplateString);
680   BacktickToken->Tok.setKind(tok::string_literal);
681   BacktickToken->TokenText = LiteralText;
682 
683   // Adjust width for potentially multiline string literals.
684   size_t FirstBreak = LiteralText.find('\n');
685   StringRef FirstLineText = FirstBreak == StringRef::npos
686                                 ? LiteralText
687                                 : LiteralText.substr(0, FirstBreak);
688   BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
689       FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
690   size_t LastBreak = LiteralText.rfind('\n');
691   if (LastBreak != StringRef::npos) {
692     BacktickToken->IsMultiline = true;
693     unsigned StartColumn = 0; // The template tail spans the entire line.
694     BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
695         LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
696         Style.TabWidth, Encoding);
697   }
698 
699   SourceLocation loc = Offset < Lex->getBuffer().end()
700                            ? Lex->getSourceLocation(Offset + 1)
701                            : SourceMgr.getLocForEndOfFile(ID);
702   resetLexer(SourceMgr.getFileOffset(loc));
703 }
704 
705 void FormatTokenLexer::tryParsePythonComment() {
706   FormatToken *HashToken = Tokens.back();
707   if (!HashToken->isOneOf(tok::hash, tok::hashhash))
708     return;
709   // Turn the remainder of this line into a comment.
710   const char *CommentBegin =
711       Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
712   size_t From = CommentBegin - Lex->getBuffer().begin();
713   size_t To = Lex->getBuffer().find_first_of('\n', From);
714   if (To == StringRef::npos)
715     To = Lex->getBuffer().size();
716   size_t Len = To - From;
717   HashToken->setType(TT_LineComment);
718   HashToken->Tok.setKind(tok::comment);
719   HashToken->TokenText = Lex->getBuffer().substr(From, Len);
720   SourceLocation Loc = To < Lex->getBuffer().size()
721                            ? Lex->getSourceLocation(CommentBegin + Len)
722                            : SourceMgr.getLocForEndOfFile(ID);
723   resetLexer(SourceMgr.getFileOffset(Loc));
724 }
725 
726 bool FormatTokenLexer::tryMerge_TMacro() {
727   if (Tokens.size() < 4)
728     return false;
729   FormatToken *Last = Tokens.back();
730   if (!Last->is(tok::r_paren))
731     return false;
732 
733   FormatToken *String = Tokens[Tokens.size() - 2];
734   if (!String->is(tok::string_literal) || String->IsMultiline)
735     return false;
736 
737   if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
738     return false;
739 
740   FormatToken *Macro = Tokens[Tokens.size() - 4];
741   if (Macro->TokenText != "_T")
742     return false;
743 
744   const char *Start = Macro->TokenText.data();
745   const char *End = Last->TokenText.data() + Last->TokenText.size();
746   String->TokenText = StringRef(Start, End - Start);
747   String->IsFirst = Macro->IsFirst;
748   String->LastNewlineOffset = Macro->LastNewlineOffset;
749   String->WhitespaceRange = Macro->WhitespaceRange;
750   String->OriginalColumn = Macro->OriginalColumn;
751   String->ColumnWidth = encoding::columnWidthWithTabs(
752       String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
753   String->NewlinesBefore = Macro->NewlinesBefore;
754   String->HasUnescapedNewline = Macro->HasUnescapedNewline;
755 
756   Tokens.pop_back();
757   Tokens.pop_back();
758   Tokens.pop_back();
759   Tokens.back() = String;
760   return true;
761 }
762 
763 bool FormatTokenLexer::tryMergeConflictMarkers() {
764   if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
765     return false;
766 
767   // Conflict lines look like:
768   // <marker> <text from the vcs>
769   // For example:
770   // >>>>>>> /file/in/file/system at revision 1234
771   //
772   // We merge all tokens in a line that starts with a conflict marker
773   // into a single token with a special token type that the unwrapped line
774   // parser will use to correctly rebuild the underlying code.
775 
776   FileID ID;
777   // Get the position of the first token in the line.
778   unsigned FirstInLineOffset;
779   std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
780       Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
781   StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
782   // Calculate the offset of the start of the current line.
783   auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
784   if (LineOffset == StringRef::npos) {
785     LineOffset = 0;
786   } else {
787     ++LineOffset;
788   }
789 
790   auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
791   StringRef LineStart;
792   if (FirstSpace == StringRef::npos) {
793     LineStart = Buffer.substr(LineOffset);
794   } else {
795     LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
796   }
797 
798   TokenType Type = TT_Unknown;
799   if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
800     Type = TT_ConflictStart;
801   } else if (LineStart == "|||||||" || LineStart == "=======" ||
802              LineStart == "====") {
803     Type = TT_ConflictAlternative;
804   } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
805     Type = TT_ConflictEnd;
806   }
807 
808   if (Type != TT_Unknown) {
809     FormatToken *Next = Tokens.back();
810 
811     Tokens.resize(FirstInLineIndex + 1);
812     // We do not need to build a complete token here, as we will skip it
813     // during parsing anyway (as we must not touch whitespace around conflict
814     // markers).
815     Tokens.back()->setType(Type);
816     Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
817 
818     Tokens.push_back(Next);
819     return true;
820   }
821 
822   return false;
823 }
824 
825 FormatToken *FormatTokenLexer::getStashedToken() {
826   // Create a synthesized second '>' or '<' token.
827   Token Tok = FormatTok->Tok;
828   StringRef TokenText = FormatTok->TokenText;
829 
830   unsigned OriginalColumn = FormatTok->OriginalColumn;
831   FormatTok = new (Allocator.Allocate()) FormatToken;
832   FormatTok->Tok = Tok;
833   SourceLocation TokLocation =
834       FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
835   FormatTok->Tok.setLocation(TokLocation);
836   FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
837   FormatTok->TokenText = TokenText;
838   FormatTok->ColumnWidth = 1;
839   FormatTok->OriginalColumn = OriginalColumn + 1;
840 
841   return FormatTok;
842 }
843 
844 FormatToken *FormatTokenLexer::getNextToken() {
845   if (StateStack.top() == LexerState::TOKEN_STASHED) {
846     StateStack.pop();
847     return getStashedToken();
848   }
849 
850   FormatTok = new (Allocator.Allocate()) FormatToken;
851   readRawToken(*FormatTok);
852   SourceLocation WhitespaceStart =
853       FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
854   FormatTok->IsFirst = IsFirstToken;
855   IsFirstToken = false;
856 
857   // Consume and record whitespace until we find a significant token.
858   unsigned WhitespaceLength = TrailingWhitespace;
859   while (FormatTok->Tok.is(tok::unknown)) {
860     StringRef Text = FormatTok->TokenText;
861     auto EscapesNewline = [&](int pos) {
862       // A '\r' here is just part of '\r\n'. Skip it.
863       if (pos >= 0 && Text[pos] == '\r')
864         --pos;
865       // See whether there is an odd number of '\' before this.
866       // FIXME: This is wrong. A '\' followed by a newline is always removed,
867       // regardless of whether there is another '\' before it.
868       // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
869       unsigned count = 0;
870       for (; pos >= 0; --pos, ++count)
871         if (Text[pos] != '\\')
872           break;
873       return count & 1;
874     };
875     // FIXME: This miscounts tok:unknown tokens that are not just
876     // whitespace, e.g. a '`' character.
877     for (int i = 0, e = Text.size(); i != e; ++i) {
878       switch (Text[i]) {
879       case '\n':
880         ++FormatTok->NewlinesBefore;
881         FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
882         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
883         Column = 0;
884         break;
885       case '\r':
886         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
887         Column = 0;
888         break;
889       case '\f':
890       case '\v':
891         Column = 0;
892         break;
893       case ' ':
894         ++Column;
895         break;
896       case '\t':
897         Column +=
898             Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
899         break;
900       case '\\':
901         if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
902           FormatTok->setType(TT_ImplicitStringLiteral);
903         break;
904       default:
905         FormatTok->setType(TT_ImplicitStringLiteral);
906         break;
907       }
908       if (FormatTok->getType() == TT_ImplicitStringLiteral)
909         break;
910     }
911 
912     if (FormatTok->is(TT_ImplicitStringLiteral))
913       break;
914     WhitespaceLength += FormatTok->Tok.getLength();
915 
916     readRawToken(*FormatTok);
917   }
918 
919   // JavaScript and Java do not allow to escape the end of the line with a
920   // backslash. Backslashes are syntax errors in plain source, but can occur in
921   // comments. When a single line comment ends with a \, it'll cause the next
922   // line of code to be lexed as a comment, breaking formatting. The code below
923   // finds comments that contain a backslash followed by a line break, truncates
924   // the comment token at the backslash, and resets the lexer to restart behind
925   // the backslash.
926   if ((Style.Language == FormatStyle::LK_JavaScript ||
927        Style.Language == FormatStyle::LK_Java) &&
928       FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
929     size_t BackslashPos = FormatTok->TokenText.find('\\');
930     while (BackslashPos != StringRef::npos) {
931       if (BackslashPos + 1 < FormatTok->TokenText.size() &&
932           FormatTok->TokenText[BackslashPos + 1] == '\n') {
933         const char *Offset = Lex->getBufferLocation();
934         Offset -= FormatTok->TokenText.size();
935         Offset += BackslashPos + 1;
936         resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
937         FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
938         FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
939             FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
940             Encoding);
941         break;
942       }
943       BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
944     }
945   }
946 
947   // In case the token starts with escaped newlines, we want to
948   // take them into account as whitespace - this pattern is quite frequent
949   // in macro definitions.
950   // FIXME: Add a more explicit test.
951   while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
952     unsigned SkippedWhitespace = 0;
953     if (FormatTok->TokenText.size() > 2 &&
954         (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
955       SkippedWhitespace = 3;
956     else if (FormatTok->TokenText[1] == '\n')
957       SkippedWhitespace = 2;
958     else
959       break;
960 
961     ++FormatTok->NewlinesBefore;
962     WhitespaceLength += SkippedWhitespace;
963     FormatTok->LastNewlineOffset = SkippedWhitespace;
964     Column = 0;
965     FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
966   }
967 
968   FormatTok->WhitespaceRange = SourceRange(
969       WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
970 
971   FormatTok->OriginalColumn = Column;
972 
973   TrailingWhitespace = 0;
974   if (FormatTok->Tok.is(tok::comment)) {
975     // FIXME: Add the trimmed whitespace to Column.
976     StringRef UntrimmedText = FormatTok->TokenText;
977     FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
978     TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
979   } else if (FormatTok->Tok.is(tok::raw_identifier)) {
980     IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
981     FormatTok->Tok.setIdentifierInfo(&Info);
982     FormatTok->Tok.setKind(Info.getTokenID());
983     if (Style.Language == FormatStyle::LK_Java &&
984         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
985                            tok::kw_operator)) {
986       FormatTok->Tok.setKind(tok::identifier);
987       FormatTok->Tok.setIdentifierInfo(nullptr);
988     } else if (Style.Language == FormatStyle::LK_JavaScript &&
989                FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
990                                   tok::kw_operator)) {
991       FormatTok->Tok.setKind(tok::identifier);
992       FormatTok->Tok.setIdentifierInfo(nullptr);
993     }
994   } else if (FormatTok->Tok.is(tok::greatergreater)) {
995     FormatTok->Tok.setKind(tok::greater);
996     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
997     ++Column;
998     StateStack.push(LexerState::TOKEN_STASHED);
999   } else if (FormatTok->Tok.is(tok::lessless)) {
1000     FormatTok->Tok.setKind(tok::less);
1001     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1002     ++Column;
1003     StateStack.push(LexerState::TOKEN_STASHED);
1004   }
1005 
1006   // Now FormatTok is the next non-whitespace token.
1007 
1008   StringRef Text = FormatTok->TokenText;
1009   size_t FirstNewlinePos = Text.find('\n');
1010   if (FirstNewlinePos == StringRef::npos) {
1011     // FIXME: ColumnWidth actually depends on the start column, we need to
1012     // take this into account when the token is moved.
1013     FormatTok->ColumnWidth =
1014         encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1015     Column += FormatTok->ColumnWidth;
1016   } else {
1017     FormatTok->IsMultiline = true;
1018     // FIXME: ColumnWidth actually depends on the start column, we need to
1019     // take this into account when the token is moved.
1020     FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1021         Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1022 
1023     // The last line of the token always starts in column 0.
1024     // Thus, the length can be precomputed even in the presence of tabs.
1025     FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1026         Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1027     Column = FormatTok->LastLineColumnWidth;
1028   }
1029 
1030   if (Style.isCpp()) {
1031     auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1032     if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1033           Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1034               tok::pp_define) &&
1035         it != Macros.end()) {
1036       FormatTok->setType(it->second);
1037     } else if (FormatTok->is(tok::identifier)) {
1038       if (MacroBlockBeginRegex.match(Text)) {
1039         FormatTok->setType(TT_MacroBlockBegin);
1040       } else if (MacroBlockEndRegex.match(Text)) {
1041         FormatTok->setType(TT_MacroBlockEnd);
1042       }
1043     }
1044   }
1045 
1046   return FormatTok;
1047 }
1048 
1049 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1050   Lex->LexFromRawLexer(Tok.Tok);
1051   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1052                             Tok.Tok.getLength());
1053   // For formatting, treat unterminated string literals like normal string
1054   // literals.
1055   if (Tok.is(tok::unknown)) {
1056     if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1057       Tok.Tok.setKind(tok::string_literal);
1058       Tok.IsUnterminatedLiteral = true;
1059     } else if (Style.Language == FormatStyle::LK_JavaScript &&
1060                Tok.TokenText == "''") {
1061       Tok.Tok.setKind(tok::string_literal);
1062     }
1063   }
1064 
1065   if ((Style.Language == FormatStyle::LK_JavaScript ||
1066        Style.Language == FormatStyle::LK_Proto ||
1067        Style.Language == FormatStyle::LK_TextProto) &&
1068       Tok.is(tok::char_constant)) {
1069     Tok.Tok.setKind(tok::string_literal);
1070   }
1071 
1072   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
1073                                Tok.TokenText == "/* clang-format on */")) {
1074     FormattingDisabled = false;
1075   }
1076 
1077   Tok.Finalized = FormattingDisabled;
1078 
1079   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
1080                                Tok.TokenText == "/* clang-format off */")) {
1081     FormattingDisabled = true;
1082   }
1083 }
1084 
1085 void FormatTokenLexer::resetLexer(unsigned Offset) {
1086   StringRef Buffer = SourceMgr.getBufferData(ID);
1087   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1088                       getFormattingLangOpts(Style), Buffer.begin(),
1089                       Buffer.begin() + Offset, Buffer.end()));
1090   Lex->SetKeepWhitespaceMode(true);
1091   TrailingWhitespace = 0;
1092 }
1093 
1094 } // namespace format
1095 } // namespace clang
1096