1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements FormatTokenLexer, which tokenizes a source file 11 /// into a FormatToken stream suitable for ClangFormat. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "FormatTokenLexer.h" 16 #include "FormatToken.h" 17 #include "clang/Basic/SourceLocation.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "clang/Format/Format.h" 20 #include "llvm/Support/Regex.h" 21 22 namespace clang { 23 namespace format { 24 25 FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, 26 unsigned Column, const FormatStyle &Style, 27 encoding::Encoding Encoding) 28 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), 29 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), 30 Style(Style), IdentTable(getFormattingLangOpts(Style)), 31 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0), 32 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), 33 MacroBlockEndRegex(Style.MacroBlockEnd) { 34 Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr, 35 getFormattingLangOpts(Style))); 36 Lex->SetKeepWhitespaceMode(true); 37 38 for (const std::string &ForEachMacro : Style.ForEachMacros) 39 Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro}); 40 for (const std::string &StatementMacro : Style.StatementMacros) 41 Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro}); 42 for (const std::string &TypenameMacro : Style.TypenameMacros) 43 Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro}); 44 for (const std::string &NamespaceMacro : Style.NamespaceMacros) 45 Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro}); 46 } 47 48 ArrayRef<FormatToken *> FormatTokenLexer::lex() { 49 assert(Tokens.empty()); 50 assert(FirstInLineIndex == 0); 51 do { 52 Tokens.push_back(getNextToken()); 53 if (Style.Language == FormatStyle::LK_JavaScript) { 54 tryParseJSRegexLiteral(); 55 handleTemplateStrings(); 56 } 57 if (Style.Language == FormatStyle::LK_TextProto) 58 tryParsePythonComment(); 59 tryMergePreviousTokens(); 60 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) 61 FirstInLineIndex = Tokens.size() - 1; 62 } while (Tokens.back()->Tok.isNot(tok::eof)); 63 return Tokens; 64 } 65 66 void FormatTokenLexer::tryMergePreviousTokens() { 67 if (tryMerge_TMacro()) 68 return; 69 if (tryMergeConflictMarkers()) 70 return; 71 if (tryMergeLessLess()) 72 return; 73 74 if (Style.isCSharp()) { 75 if (tryMergeCSharpKeywordVariables()) 76 return; 77 if (tryMergeCSharpVerbatimStringLiteral()) 78 return; 79 if (tryMergeCSharpDoubleQuestion()) 80 return; 81 if (tryMergeCSharpNullConditionals()) 82 return; 83 if (tryTransformCSharpForEach()) 84 return; 85 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 86 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) 87 return; 88 } 89 90 if (tryMergeNSStringLiteral()) 91 return; 92 93 if (Style.Language == FormatStyle::LK_JavaScript) { 94 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; 95 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, 96 tok::equal}; 97 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, 98 tok::greaterequal}; 99 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 100 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star}; 101 static const tok::TokenKind JSExponentiationEqual[] = {tok::star, 102 tok::starequal}; 103 static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question, 104 tok::period}; 105 static const tok::TokenKind JSNullishOperator[] = {tok::question, 106 tok::question}; 107 108 // FIXME: Investigate what token type gives the correct operator priority. 109 if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) 110 return; 111 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) 112 return; 113 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) 114 return; 115 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) 116 return; 117 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation)) 118 return; 119 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) { 120 Tokens.back()->Tok.setKind(tok::starequal); 121 return; 122 } 123 if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) 124 return; 125 if (tryMergeTokens(JSNullPropagatingOperator, 126 TT_JsNullPropagatingOperator)) { 127 // Treat like a regular "." access. 128 Tokens.back()->Tok.setKind(tok::period); 129 return; 130 } 131 if (tryMergeJSPrivateIdentifier()) 132 return; 133 } 134 135 if (Style.Language == FormatStyle::LK_Java) { 136 static const tok::TokenKind JavaRightLogicalShiftAssign[] = { 137 tok::greater, tok::greater, tok::greaterequal}; 138 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) 139 return; 140 } 141 } 142 143 bool FormatTokenLexer::tryMergeNSStringLiteral() { 144 if (Tokens.size() < 2) 145 return false; 146 auto &At = *(Tokens.end() - 2); 147 auto &String = *(Tokens.end() - 1); 148 if (!At->is(tok::at) || !String->is(tok::string_literal)) 149 return false; 150 At->Tok.setKind(tok::string_literal); 151 At->TokenText = StringRef(At->TokenText.begin(), 152 String->TokenText.end() - At->TokenText.begin()); 153 At->ColumnWidth += String->ColumnWidth; 154 At->Type = TT_ObjCStringLiteral; 155 Tokens.erase(Tokens.end() - 1); 156 return true; 157 } 158 159 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { 160 // Merges #idenfier into a single identifier with the text #identifier 161 // but the token tok::identifier. 162 if (Tokens.size() < 2) 163 return false; 164 auto &Hash = *(Tokens.end() - 2); 165 auto &Identifier = *(Tokens.end() - 1); 166 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier)) 167 return false; 168 Hash->Tok.setKind(tok::identifier); 169 Hash->TokenText = 170 StringRef(Hash->TokenText.begin(), 171 Identifier->TokenText.end() - Hash->TokenText.begin()); 172 Hash->ColumnWidth += Identifier->ColumnWidth; 173 Hash->Type = TT_JsPrivateIdentifier; 174 Tokens.erase(Tokens.end() - 1); 175 return true; 176 } 177 178 // Search for verbatim or interpolated string literals @"ABC" or 179 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to 180 // prevent splitting of @, $ and ". 181 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() { 182 if (Tokens.size() < 2) 183 return false; 184 auto &At = *(Tokens.end() - 2); 185 auto &String = *(Tokens.end() - 1); 186 187 // Look for $"aaaaaa" @"aaaaaa". 188 if (!(At->is(tok::at) || At->TokenText == "$") || 189 !String->is(tok::string_literal)) 190 return false; 191 192 if (Tokens.size() >= 2 && At->is(tok::at)) { 193 auto &Dollar = *(Tokens.end() - 3); 194 if (Dollar->TokenText == "$") { 195 // This looks like $@"aaaaa" so we need to combine all 3 tokens. 196 Dollar->Tok.setKind(tok::string_literal); 197 Dollar->TokenText = 198 StringRef(Dollar->TokenText.begin(), 199 String->TokenText.end() - Dollar->TokenText.begin()); 200 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth); 201 Dollar->Type = TT_CSharpStringLiteral; 202 Tokens.erase(Tokens.end() - 2); 203 Tokens.erase(Tokens.end() - 1); 204 return true; 205 } 206 } 207 208 // Convert back into just a string_literal. 209 At->Tok.setKind(tok::string_literal); 210 At->TokenText = StringRef(At->TokenText.begin(), 211 String->TokenText.end() - At->TokenText.begin()); 212 At->ColumnWidth += String->ColumnWidth; 213 At->Type = TT_CSharpStringLiteral; 214 Tokens.erase(Tokens.end() - 1); 215 return true; 216 } 217 218 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() { 219 if (Tokens.size() < 2) 220 return false; 221 auto &FirstQuestion = *(Tokens.end() - 2); 222 auto &SecondQuestion = *(Tokens.end() - 1); 223 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question)) 224 return false; 225 FirstQuestion->Tok.setKind(tok::question); 226 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(), 227 SecondQuestion->TokenText.end() - 228 FirstQuestion->TokenText.begin()); 229 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth; 230 FirstQuestion->Type = TT_CSharpNullCoalescing; 231 Tokens.erase(Tokens.end() - 1); 232 return true; 233 } 234 235 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { 236 if (Tokens.size() < 2) 237 return false; 238 auto &At = *(Tokens.end() - 2); 239 auto &Keyword = *(Tokens.end() - 1); 240 if (!At->is(tok::at)) 241 return false; 242 if (!Keywords.isCSharpKeyword(*Keyword)) 243 return false; 244 245 At->Tok.setKind(tok::identifier); 246 At->TokenText = StringRef(At->TokenText.begin(), 247 Keyword->TokenText.end() - At->TokenText.begin()); 248 At->ColumnWidth += Keyword->ColumnWidth; 249 At->Type = Keyword->Type; 250 Tokens.erase(Tokens.end() - 1); 251 return true; 252 } 253 254 // In C# merge the Identifier and the ? together e.g. arg?. 255 bool FormatTokenLexer::tryMergeCSharpNullConditionals() { 256 if (Tokens.size() < 2) 257 return false; 258 auto &Identifier = *(Tokens.end() - 2); 259 auto &Question = *(Tokens.end() - 1); 260 if (!Identifier->isOneOf(tok::r_square, tok::identifier) || 261 !Question->is(tok::question)) 262 return false; 263 Identifier->TokenText = 264 StringRef(Identifier->TokenText.begin(), 265 Question->TokenText.end() - Identifier->TokenText.begin()); 266 Identifier->ColumnWidth += Question->ColumnWidth; 267 Tokens.erase(Tokens.end() - 1); 268 return true; 269 } 270 271 // In C# transform identifier foreach into kw_foreach 272 bool FormatTokenLexer::tryTransformCSharpForEach() { 273 if (Tokens.size() < 1) 274 return false; 275 auto &Identifier = *(Tokens.end() - 1); 276 if (!Identifier->is(tok::identifier)) 277 return false; 278 if (Identifier->TokenText != "foreach") 279 return false; 280 281 Identifier->Type = TT_ForEachMacro; 282 Identifier->Tok.setKind(tok::kw_for); 283 return true; 284 } 285 286 bool FormatTokenLexer::tryMergeLessLess() { 287 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. 288 if (Tokens.size() < 3) 289 return false; 290 291 bool FourthTokenIsLess = false; 292 if (Tokens.size() > 3) 293 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less); 294 295 auto First = Tokens.end() - 3; 296 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) || 297 First[0]->isNot(tok::less) || FourthTokenIsLess) 298 return false; 299 300 // Only merge if there currently is no whitespace between the two "<". 301 if (First[1]->WhitespaceRange.getBegin() != 302 First[1]->WhitespaceRange.getEnd()) 303 return false; 304 305 First[0]->Tok.setKind(tok::lessless); 306 First[0]->TokenText = "<<"; 307 First[0]->ColumnWidth += 1; 308 Tokens.erase(Tokens.end() - 2); 309 return true; 310 } 311 312 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, 313 TokenType NewType) { 314 if (Tokens.size() < Kinds.size()) 315 return false; 316 317 SmallVectorImpl<FormatToken *>::const_iterator First = 318 Tokens.end() - Kinds.size(); 319 if (!First[0]->is(Kinds[0])) 320 return false; 321 unsigned AddLength = 0; 322 for (unsigned i = 1; i < Kinds.size(); ++i) { 323 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() != 324 First[i]->WhitespaceRange.getEnd()) 325 return false; 326 AddLength += First[i]->TokenText.size(); 327 } 328 Tokens.resize(Tokens.size() - Kinds.size() + 1); 329 First[0]->TokenText = StringRef(First[0]->TokenText.data(), 330 First[0]->TokenText.size() + AddLength); 331 First[0]->ColumnWidth += AddLength; 332 First[0]->Type = NewType; 333 return true; 334 } 335 336 // Returns \c true if \p Tok can only be followed by an operand in JavaScript. 337 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { 338 // NB: This is not entirely correct, as an r_paren can introduce an operand 339 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough 340 // corner case to not matter in practice, though. 341 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, 342 tok::r_brace, tok::l_square, tok::semi, tok::exclaim, 343 tok::colon, tok::question, tok::tilde) || 344 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, 345 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, 346 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || 347 Tok->isBinaryOperator(); 348 } 349 350 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { 351 if (!Prev) 352 return true; 353 354 // Regex literals can only follow after prefix unary operators, not after 355 // postfix unary operators. If the '++' is followed by a non-operand 356 // introducing token, the slash here is the operand and not the start of a 357 // regex. 358 // `!` is an unary prefix operator, but also a post-fix operator that casts 359 // away nullability, so the same check applies. 360 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) 361 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3])); 362 363 // The previous token must introduce an operand location where regex 364 // literals can occur. 365 if (!precedesOperand(Prev)) 366 return false; 367 368 return true; 369 } 370 371 // Tries to parse a JavaScript Regex literal starting at the current token, 372 // if that begins with a slash and is in a location where JavaScript allows 373 // regex literals. Changes the current token to a regex literal and updates 374 // its text if successful. 375 void FormatTokenLexer::tryParseJSRegexLiteral() { 376 FormatToken *RegexToken = Tokens.back(); 377 if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) 378 return; 379 380 FormatToken *Prev = nullptr; 381 for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) { 382 // NB: Because previous pointers are not initialized yet, this cannot use 383 // Token.getPreviousNonComment. 384 if ((*I)->isNot(tok::comment)) { 385 Prev = *I; 386 break; 387 } 388 } 389 390 if (!canPrecedeRegexLiteral(Prev)) 391 return; 392 393 // 'Manually' lex ahead in the current file buffer. 394 const char *Offset = Lex->getBufferLocation(); 395 const char *RegexBegin = Offset - RegexToken->TokenText.size(); 396 StringRef Buffer = Lex->getBuffer(); 397 bool InCharacterClass = false; 398 bool HaveClosingSlash = false; 399 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { 400 // Regular expressions are terminated with a '/', which can only be 401 // escaped using '\' or a character class between '[' and ']'. 402 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. 403 switch (*Offset) { 404 case '\\': 405 // Skip the escaped character. 406 ++Offset; 407 break; 408 case '[': 409 InCharacterClass = true; 410 break; 411 case ']': 412 InCharacterClass = false; 413 break; 414 case '/': 415 if (!InCharacterClass) 416 HaveClosingSlash = true; 417 break; 418 } 419 } 420 421 RegexToken->Type = TT_RegexLiteral; 422 // Treat regex literals like other string_literals. 423 RegexToken->Tok.setKind(tok::string_literal); 424 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); 425 RegexToken->ColumnWidth = RegexToken->TokenText.size(); 426 427 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 428 } 429 430 void FormatTokenLexer::handleTemplateStrings() { 431 FormatToken *BacktickToken = Tokens.back(); 432 433 if (BacktickToken->is(tok::l_brace)) { 434 StateStack.push(LexerState::NORMAL); 435 return; 436 } 437 if (BacktickToken->is(tok::r_brace)) { 438 if (StateStack.size() == 1) 439 return; 440 StateStack.pop(); 441 if (StateStack.top() != LexerState::TEMPLATE_STRING) 442 return; 443 // If back in TEMPLATE_STRING, fallthrough and continue parsing the 444 } else if (BacktickToken->is(tok::unknown) && 445 BacktickToken->TokenText == "`") { 446 StateStack.push(LexerState::TEMPLATE_STRING); 447 } else { 448 return; // Not actually a template 449 } 450 451 // 'Manually' lex ahead in the current file buffer. 452 const char *Offset = Lex->getBufferLocation(); 453 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" 454 for (; Offset != Lex->getBuffer().end(); ++Offset) { 455 if (Offset[0] == '`') { 456 StateStack.pop(); 457 break; 458 } 459 if (Offset[0] == '\\') { 460 ++Offset; // Skip the escaped character. 461 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' && 462 Offset[1] == '{') { 463 // '${' introduces an expression interpolation in the template string. 464 StateStack.push(LexerState::NORMAL); 465 ++Offset; 466 break; 467 } 468 } 469 470 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); 471 BacktickToken->Type = TT_TemplateString; 472 BacktickToken->Tok.setKind(tok::string_literal); 473 BacktickToken->TokenText = LiteralText; 474 475 // Adjust width for potentially multiline string literals. 476 size_t FirstBreak = LiteralText.find('\n'); 477 StringRef FirstLineText = FirstBreak == StringRef::npos 478 ? LiteralText 479 : LiteralText.substr(0, FirstBreak); 480 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( 481 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); 482 size_t LastBreak = LiteralText.rfind('\n'); 483 if (LastBreak != StringRef::npos) { 484 BacktickToken->IsMultiline = true; 485 unsigned StartColumn = 0; // The template tail spans the entire line. 486 BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs( 487 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 488 Style.TabWidth, Encoding); 489 } 490 491 SourceLocation loc = Offset < Lex->getBuffer().end() 492 ? Lex->getSourceLocation(Offset + 1) 493 : SourceMgr.getLocForEndOfFile(ID); 494 resetLexer(SourceMgr.getFileOffset(loc)); 495 } 496 497 void FormatTokenLexer::tryParsePythonComment() { 498 FormatToken *HashToken = Tokens.back(); 499 if (!HashToken->isOneOf(tok::hash, tok::hashhash)) 500 return; 501 // Turn the remainder of this line into a comment. 502 const char *CommentBegin = 503 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#" 504 size_t From = CommentBegin - Lex->getBuffer().begin(); 505 size_t To = Lex->getBuffer().find_first_of('\n', From); 506 if (To == StringRef::npos) 507 To = Lex->getBuffer().size(); 508 size_t Len = To - From; 509 HashToken->Type = TT_LineComment; 510 HashToken->Tok.setKind(tok::comment); 511 HashToken->TokenText = Lex->getBuffer().substr(From, Len); 512 SourceLocation Loc = To < Lex->getBuffer().size() 513 ? Lex->getSourceLocation(CommentBegin + Len) 514 : SourceMgr.getLocForEndOfFile(ID); 515 resetLexer(SourceMgr.getFileOffset(Loc)); 516 } 517 518 bool FormatTokenLexer::tryMerge_TMacro() { 519 if (Tokens.size() < 4) 520 return false; 521 FormatToken *Last = Tokens.back(); 522 if (!Last->is(tok::r_paren)) 523 return false; 524 525 FormatToken *String = Tokens[Tokens.size() - 2]; 526 if (!String->is(tok::string_literal) || String->IsMultiline) 527 return false; 528 529 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) 530 return false; 531 532 FormatToken *Macro = Tokens[Tokens.size() - 4]; 533 if (Macro->TokenText != "_T") 534 return false; 535 536 const char *Start = Macro->TokenText.data(); 537 const char *End = Last->TokenText.data() + Last->TokenText.size(); 538 String->TokenText = StringRef(Start, End - Start); 539 String->IsFirst = Macro->IsFirst; 540 String->LastNewlineOffset = Macro->LastNewlineOffset; 541 String->WhitespaceRange = Macro->WhitespaceRange; 542 String->OriginalColumn = Macro->OriginalColumn; 543 String->ColumnWidth = encoding::columnWidthWithTabs( 544 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); 545 String->NewlinesBefore = Macro->NewlinesBefore; 546 String->HasUnescapedNewline = Macro->HasUnescapedNewline; 547 548 Tokens.pop_back(); 549 Tokens.pop_back(); 550 Tokens.pop_back(); 551 Tokens.back() = String; 552 return true; 553 } 554 555 bool FormatTokenLexer::tryMergeConflictMarkers() { 556 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) 557 return false; 558 559 // Conflict lines look like: 560 // <marker> <text from the vcs> 561 // For example: 562 // >>>>>>> /file/in/file/system at revision 1234 563 // 564 // We merge all tokens in a line that starts with a conflict marker 565 // into a single token with a special token type that the unwrapped line 566 // parser will use to correctly rebuild the underlying code. 567 568 FileID ID; 569 // Get the position of the first token in the line. 570 unsigned FirstInLineOffset; 571 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( 572 Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); 573 StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer(); 574 // Calculate the offset of the start of the current line. 575 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); 576 if (LineOffset == StringRef::npos) { 577 LineOffset = 0; 578 } else { 579 ++LineOffset; 580 } 581 582 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); 583 StringRef LineStart; 584 if (FirstSpace == StringRef::npos) { 585 LineStart = Buffer.substr(LineOffset); 586 } else { 587 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); 588 } 589 590 TokenType Type = TT_Unknown; 591 if (LineStart == "<<<<<<<" || LineStart == ">>>>") { 592 Type = TT_ConflictStart; 593 } else if (LineStart == "|||||||" || LineStart == "=======" || 594 LineStart == "====") { 595 Type = TT_ConflictAlternative; 596 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { 597 Type = TT_ConflictEnd; 598 } 599 600 if (Type != TT_Unknown) { 601 FormatToken *Next = Tokens.back(); 602 603 Tokens.resize(FirstInLineIndex + 1); 604 // We do not need to build a complete token here, as we will skip it 605 // during parsing anyway (as we must not touch whitespace around conflict 606 // markers). 607 Tokens.back()->Type = Type; 608 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); 609 610 Tokens.push_back(Next); 611 return true; 612 } 613 614 return false; 615 } 616 617 FormatToken *FormatTokenLexer::getStashedToken() { 618 // Create a synthesized second '>' or '<' token. 619 Token Tok = FormatTok->Tok; 620 StringRef TokenText = FormatTok->TokenText; 621 622 unsigned OriginalColumn = FormatTok->OriginalColumn; 623 FormatTok = new (Allocator.Allocate()) FormatToken; 624 FormatTok->Tok = Tok; 625 SourceLocation TokLocation = 626 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); 627 FormatTok->Tok.setLocation(TokLocation); 628 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); 629 FormatTok->TokenText = TokenText; 630 FormatTok->ColumnWidth = 1; 631 FormatTok->OriginalColumn = OriginalColumn + 1; 632 633 return FormatTok; 634 } 635 636 FormatToken *FormatTokenLexer::getNextToken() { 637 if (StateStack.top() == LexerState::TOKEN_STASHED) { 638 StateStack.pop(); 639 return getStashedToken(); 640 } 641 642 FormatTok = new (Allocator.Allocate()) FormatToken; 643 readRawToken(*FormatTok); 644 SourceLocation WhitespaceStart = 645 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace); 646 FormatTok->IsFirst = IsFirstToken; 647 IsFirstToken = false; 648 649 // Consume and record whitespace until we find a significant token. 650 unsigned WhitespaceLength = TrailingWhitespace; 651 while (FormatTok->Tok.is(tok::unknown)) { 652 StringRef Text = FormatTok->TokenText; 653 auto EscapesNewline = [&](int pos) { 654 // A '\r' here is just part of '\r\n'. Skip it. 655 if (pos >= 0 && Text[pos] == '\r') 656 --pos; 657 // See whether there is an odd number of '\' before this. 658 // FIXME: This is wrong. A '\' followed by a newline is always removed, 659 // regardless of whether there is another '\' before it. 660 // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph. 661 unsigned count = 0; 662 for (; pos >= 0; --pos, ++count) 663 if (Text[pos] != '\\') 664 break; 665 return count & 1; 666 }; 667 // FIXME: This miscounts tok:unknown tokens that are not just 668 // whitespace, e.g. a '`' character. 669 for (int i = 0, e = Text.size(); i != e; ++i) { 670 switch (Text[i]) { 671 case '\n': 672 ++FormatTok->NewlinesBefore; 673 FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1); 674 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; 675 Column = 0; 676 break; 677 case '\r': 678 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; 679 Column = 0; 680 break; 681 case '\f': 682 case '\v': 683 Column = 0; 684 break; 685 case ' ': 686 ++Column; 687 break; 688 case '\t': 689 Column += 690 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0); 691 break; 692 case '\\': 693 if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) 694 FormatTok->Type = TT_ImplicitStringLiteral; 695 break; 696 default: 697 FormatTok->Type = TT_ImplicitStringLiteral; 698 break; 699 } 700 if (FormatTok->Type == TT_ImplicitStringLiteral) 701 break; 702 } 703 704 if (FormatTok->is(TT_ImplicitStringLiteral)) 705 break; 706 WhitespaceLength += FormatTok->Tok.getLength(); 707 708 readRawToken(*FormatTok); 709 } 710 711 // JavaScript and Java do not allow to escape the end of the line with a 712 // backslash. Backslashes are syntax errors in plain source, but can occur in 713 // comments. When a single line comment ends with a \, it'll cause the next 714 // line of code to be lexed as a comment, breaking formatting. The code below 715 // finds comments that contain a backslash followed by a line break, truncates 716 // the comment token at the backslash, and resets the lexer to restart behind 717 // the backslash. 718 if ((Style.Language == FormatStyle::LK_JavaScript || 719 Style.Language == FormatStyle::LK_Java) && 720 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) { 721 size_t BackslashPos = FormatTok->TokenText.find('\\'); 722 while (BackslashPos != StringRef::npos) { 723 if (BackslashPos + 1 < FormatTok->TokenText.size() && 724 FormatTok->TokenText[BackslashPos + 1] == '\n') { 725 const char *Offset = Lex->getBufferLocation(); 726 Offset -= FormatTok->TokenText.size(); 727 Offset += BackslashPos + 1; 728 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 729 FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1); 730 FormatTok->ColumnWidth = encoding::columnWidthWithTabs( 731 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, 732 Encoding); 733 break; 734 } 735 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1); 736 } 737 } 738 739 // In case the token starts with escaped newlines, we want to 740 // take them into account as whitespace - this pattern is quite frequent 741 // in macro definitions. 742 // FIXME: Add a more explicit test. 743 while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') { 744 unsigned SkippedWhitespace = 0; 745 if (FormatTok->TokenText.size() > 2 && 746 (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n')) 747 SkippedWhitespace = 3; 748 else if (FormatTok->TokenText[1] == '\n') 749 SkippedWhitespace = 2; 750 else 751 break; 752 753 ++FormatTok->NewlinesBefore; 754 WhitespaceLength += SkippedWhitespace; 755 FormatTok->LastNewlineOffset = SkippedWhitespace; 756 Column = 0; 757 FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace); 758 } 759 760 FormatTok->WhitespaceRange = SourceRange( 761 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); 762 763 FormatTok->OriginalColumn = Column; 764 765 TrailingWhitespace = 0; 766 if (FormatTok->Tok.is(tok::comment)) { 767 // FIXME: Add the trimmed whitespace to Column. 768 StringRef UntrimmedText = FormatTok->TokenText; 769 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f"); 770 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size(); 771 } else if (FormatTok->Tok.is(tok::raw_identifier)) { 772 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText); 773 FormatTok->Tok.setIdentifierInfo(&Info); 774 FormatTok->Tok.setKind(Info.getTokenID()); 775 if (Style.Language == FormatStyle::LK_Java && 776 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, 777 tok::kw_operator)) { 778 FormatTok->Tok.setKind(tok::identifier); 779 FormatTok->Tok.setIdentifierInfo(nullptr); 780 } else if (Style.Language == FormatStyle::LK_JavaScript && 781 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, 782 tok::kw_operator)) { 783 FormatTok->Tok.setKind(tok::identifier); 784 FormatTok->Tok.setIdentifierInfo(nullptr); 785 } 786 } else if (FormatTok->Tok.is(tok::greatergreater)) { 787 FormatTok->Tok.setKind(tok::greater); 788 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); 789 ++Column; 790 StateStack.push(LexerState::TOKEN_STASHED); 791 } else if (FormatTok->Tok.is(tok::lessless)) { 792 FormatTok->Tok.setKind(tok::less); 793 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); 794 ++Column; 795 StateStack.push(LexerState::TOKEN_STASHED); 796 } 797 798 // Now FormatTok is the next non-whitespace token. 799 800 StringRef Text = FormatTok->TokenText; 801 size_t FirstNewlinePos = Text.find('\n'); 802 if (FirstNewlinePos == StringRef::npos) { 803 // FIXME: ColumnWidth actually depends on the start column, we need to 804 // take this into account when the token is moved. 805 FormatTok->ColumnWidth = 806 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding); 807 Column += FormatTok->ColumnWidth; 808 } else { 809 FormatTok->IsMultiline = true; 810 // FIXME: ColumnWidth actually depends on the start column, we need to 811 // take this into account when the token is moved. 812 FormatTok->ColumnWidth = encoding::columnWidthWithTabs( 813 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding); 814 815 // The last line of the token always starts in column 0. 816 // Thus, the length can be precomputed even in the presence of tabs. 817 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( 818 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding); 819 Column = FormatTok->LastLineColumnWidth; 820 } 821 822 if (Style.isCpp()) { 823 auto it = Macros.find(FormatTok->Tok.getIdentifierInfo()); 824 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() && 825 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() == 826 tok::pp_define) && 827 it != Macros.end()) { 828 FormatTok->Type = it->second; 829 } else if (FormatTok->is(tok::identifier)) { 830 if (MacroBlockBeginRegex.match(Text)) { 831 FormatTok->Type = TT_MacroBlockBegin; 832 } else if (MacroBlockEndRegex.match(Text)) { 833 FormatTok->Type = TT_MacroBlockEnd; 834 } 835 } 836 } 837 838 return FormatTok; 839 } 840 841 void FormatTokenLexer::readRawToken(FormatToken &Tok) { 842 Lex->LexFromRawLexer(Tok.Tok); 843 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()), 844 Tok.Tok.getLength()); 845 // For formatting, treat unterminated string literals like normal string 846 // literals. 847 if (Tok.is(tok::unknown)) { 848 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') { 849 Tok.Tok.setKind(tok::string_literal); 850 Tok.IsUnterminatedLiteral = true; 851 } else if (Style.Language == FormatStyle::LK_JavaScript && 852 Tok.TokenText == "''") { 853 Tok.Tok.setKind(tok::string_literal); 854 } 855 } 856 857 if ((Style.Language == FormatStyle::LK_JavaScript || 858 Style.Language == FormatStyle::LK_Proto || 859 Style.Language == FormatStyle::LK_TextProto) && 860 Tok.is(tok::char_constant)) { 861 Tok.Tok.setKind(tok::string_literal); 862 } 863 864 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" || 865 Tok.TokenText == "/* clang-format on */")) { 866 FormattingDisabled = false; 867 } 868 869 Tok.Finalized = FormattingDisabled; 870 871 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" || 872 Tok.TokenText == "/* clang-format off */")) { 873 FormattingDisabled = true; 874 } 875 } 876 877 void FormatTokenLexer::resetLexer(unsigned Offset) { 878 StringRef Buffer = SourceMgr.getBufferData(ID); 879 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), 880 getFormattingLangOpts(Style), Buffer.begin(), 881 Buffer.begin() + Offset, Buffer.end())); 882 Lex->SetKeepWhitespaceMode(true); 883 TrailingWhitespace = 0; 884 } 885 886 } // namespace format 887 } // namespace clang 888