1 //===--- CommentLexer.cpp -------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "clang/AST/CommentLexer.h" 10 #include "clang/AST/CommentCommandTraits.h" 11 #include "clang/AST/CommentDiagnostic.h" 12 #include "clang/Basic/CharInfo.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/StringSwitch.h" 15 #include "llvm/Support/ConvertUTF.h" 16 #include "llvm/Support/ErrorHandling.h" 17 18 namespace clang { 19 namespace comments { 20 21 void Token::dump(const Lexer &L, const SourceManager &SM) const { 22 llvm::errs() << "comments::Token Kind=" << Kind << " "; 23 Loc.print(llvm::errs(), SM); 24 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 25 } 26 27 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 28 return isLetter(C); 29 } 30 31 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 32 return isDigit(C); 33 } 34 35 static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 36 return isHexDigit(C); 37 } 38 39 static inline StringRef convertCodePointToUTF8( 40 llvm::BumpPtrAllocator &Allocator, 41 unsigned CodePoint) { 42 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 43 char *ResolvedPtr = Resolved; 44 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 45 return StringRef(Resolved, ResolvedPtr - Resolved); 46 else 47 return StringRef(); 48 } 49 50 namespace { 51 52 #include "clang/AST/CommentHTMLTags.inc" 53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 54 55 } // end anonymous namespace 56 57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 58 // Fast path, first check a few most widely used named character references. 59 return llvm::StringSwitch<StringRef>(Name) 60 .Case("amp", "&") 61 .Case("lt", "<") 62 .Case("gt", ">") 63 .Case("quot", "\"") 64 .Case("apos", "\'") 65 // Slow path. 66 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 67 } 68 69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 70 unsigned CodePoint = 0; 71 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 72 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 73 CodePoint *= 10; 74 CodePoint += Name[i] - '0'; 75 } 76 return convertCodePointToUTF8(Allocator, CodePoint); 77 } 78 79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 80 unsigned CodePoint = 0; 81 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 82 CodePoint *= 16; 83 const char C = Name[i]; 84 assert(isHTMLHexCharacterReferenceCharacter(C)); 85 CodePoint += llvm::hexDigitValue(C); 86 } 87 return convertCodePointToUTF8(Allocator, CodePoint); 88 } 89 90 void Lexer::skipLineStartingDecorations() { 91 // This function should be called only for C comments 92 assert(CommentState == LCS_InsideCComment); 93 94 if (BufferPtr == CommentEnd) 95 return; 96 97 const char *NewBufferPtr = BufferPtr; 98 while (isHorizontalWhitespace(*NewBufferPtr)) 99 if (++NewBufferPtr == CommentEnd) 100 return; 101 if (*NewBufferPtr == '*') 102 BufferPtr = NewBufferPtr + 1; 103 } 104 105 namespace { 106 /// Returns pointer to the first newline character in the string. 107 const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 108 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 109 if (isVerticalWhitespace(*BufferPtr)) 110 return BufferPtr; 111 } 112 return BufferEnd; 113 } 114 115 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 116 if (BufferPtr == BufferEnd) 117 return BufferPtr; 118 119 if (*BufferPtr == '\n') 120 BufferPtr++; 121 else { 122 assert(*BufferPtr == '\r'); 123 BufferPtr++; 124 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 125 BufferPtr++; 126 } 127 return BufferPtr; 128 } 129 130 const char *skipNamedCharacterReference(const char *BufferPtr, 131 const char *BufferEnd) { 132 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 133 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 134 return BufferPtr; 135 } 136 return BufferEnd; 137 } 138 139 const char *skipDecimalCharacterReference(const char *BufferPtr, 140 const char *BufferEnd) { 141 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 142 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 143 return BufferPtr; 144 } 145 return BufferEnd; 146 } 147 148 const char *skipHexCharacterReference(const char *BufferPtr, 149 const char *BufferEnd) { 150 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 151 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 152 return BufferPtr; 153 } 154 return BufferEnd; 155 } 156 157 bool isHTMLIdentifierStartingCharacter(char C) { 158 return isLetter(C); 159 } 160 161 bool isHTMLIdentifierCharacter(char C) { 162 return isAlphanumeric(C); 163 } 164 165 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 166 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 167 if (!isHTMLIdentifierCharacter(*BufferPtr)) 168 return BufferPtr; 169 } 170 return BufferEnd; 171 } 172 173 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 174 /// string allowed. 175 /// 176 /// Returns pointer to closing quote. 177 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 178 { 179 const char Quote = *BufferPtr; 180 assert(Quote == '\"' || Quote == '\''); 181 182 BufferPtr++; 183 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 184 const char C = *BufferPtr; 185 if (C == Quote && BufferPtr[-1] != '\\') 186 return BufferPtr; 187 } 188 return BufferEnd; 189 } 190 191 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 192 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 193 if (!isWhitespace(*BufferPtr)) 194 return BufferPtr; 195 } 196 return BufferEnd; 197 } 198 199 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 200 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 201 } 202 203 bool isCommandNameStartCharacter(char C) { 204 return isLetter(C); 205 } 206 207 bool isCommandNameCharacter(char C) { 208 return isAlphanumeric(C); 209 } 210 211 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 212 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 213 if (!isCommandNameCharacter(*BufferPtr)) 214 return BufferPtr; 215 } 216 return BufferEnd; 217 } 218 219 /// Return the one past end pointer for BCPL comments. 220 /// Handles newlines escaped with backslash or trigraph for backslahs. 221 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 222 const char *CurPtr = BufferPtr; 223 while (CurPtr != BufferEnd) { 224 while (!isVerticalWhitespace(*CurPtr)) { 225 CurPtr++; 226 if (CurPtr == BufferEnd) 227 return BufferEnd; 228 } 229 // We found a newline, check if it is escaped. 230 const char *EscapePtr = CurPtr - 1; 231 while(isHorizontalWhitespace(*EscapePtr)) 232 EscapePtr--; 233 234 if (*EscapePtr == '\\' || 235 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 236 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 237 // We found an escaped newline. 238 CurPtr = skipNewline(CurPtr, BufferEnd); 239 } else 240 return CurPtr; // Not an escaped newline. 241 } 242 return BufferEnd; 243 } 244 245 /// Return the one past end pointer for C comments. 246 /// Very dumb, does not handle escaped newlines or trigraphs. 247 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 248 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 249 if (*BufferPtr == '*') { 250 assert(BufferPtr + 1 != BufferEnd); 251 if (*(BufferPtr + 1) == '/') 252 return BufferPtr; 253 } 254 } 255 llvm_unreachable("buffer end hit before '*/' was seen"); 256 } 257 258 } // end anonymous namespace 259 260 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, 261 tok::TokenKind Kind) { 262 const unsigned TokLen = TokEnd - BufferPtr; 263 Result.setLocation(getSourceLocation(BufferPtr)); 264 Result.setKind(Kind); 265 Result.setLength(TokLen); 266 #ifndef NDEBUG 267 Result.TextPtr = "<UNSET>"; 268 Result.IntVal = 7; 269 #endif 270 BufferPtr = TokEnd; 271 } 272 273 const char *Lexer::skipTextToken() { 274 const char *TokenPtr = BufferPtr; 275 assert(TokenPtr < CommentEnd); 276 StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r"; 277 278 again: 279 size_t End = 280 StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols); 281 if (End == StringRef::npos) 282 return CommentEnd; 283 284 // Doxygen doesn't recognize any commands in a one-line double quotation. 285 // If we don't find an ending quotation mark, we pretend it never began. 286 if (*(TokenPtr + End) == '\"') { 287 TokenPtr += End + 1; 288 End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\""); 289 if (End != StringRef::npos && *(TokenPtr + End) == '\"') 290 TokenPtr += End + 1; 291 goto again; 292 } 293 return TokenPtr + End; 294 } 295 296 void Lexer::lexCommentText(Token &T) { 297 assert(CommentState == LCS_InsideBCPLComment || 298 CommentState == LCS_InsideCComment); 299 300 // Handles lexing non-command text, i.e. text and newline. 301 auto HandleNonCommandToken = [&]() -> void { 302 assert(State == LS_Normal); 303 304 const char *TokenPtr = BufferPtr; 305 assert(TokenPtr < CommentEnd); 306 switch (*TokenPtr) { 307 case '\n': 308 case '\r': 309 TokenPtr = skipNewline(TokenPtr, CommentEnd); 310 formTokenWithChars(T, TokenPtr, tok::newline); 311 312 if (CommentState == LCS_InsideCComment) 313 skipLineStartingDecorations(); 314 return; 315 316 default: 317 return formTextToken(T, skipTextToken()); 318 } 319 }; 320 321 if (!ParseCommands) 322 return HandleNonCommandToken(); 323 324 switch (State) { 325 case LS_Normal: 326 break; 327 case LS_VerbatimBlockFirstLine: 328 lexVerbatimBlockFirstLine(T); 329 return; 330 case LS_VerbatimBlockBody: 331 lexVerbatimBlockBody(T); 332 return; 333 case LS_VerbatimLineText: 334 lexVerbatimLineText(T); 335 return; 336 case LS_HTMLStartTag: 337 lexHTMLStartTag(T); 338 return; 339 case LS_HTMLEndTag: 340 lexHTMLEndTag(T); 341 return; 342 } 343 344 assert(State == LS_Normal); 345 const char *TokenPtr = BufferPtr; 346 assert(TokenPtr < CommentEnd); 347 switch(*TokenPtr) { 348 case '\\': 349 case '@': { 350 // Commands that start with a backslash and commands that start with 351 // 'at' have equivalent semantics. But we keep information about the 352 // exact syntax in AST for comments. 353 tok::TokenKind CommandKind = 354 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 355 TokenPtr++; 356 if (TokenPtr == CommentEnd) { 357 formTextToken(T, TokenPtr); 358 return; 359 } 360 char C = *TokenPtr; 361 switch (C) { 362 default: 363 break; 364 365 case '\\': case '@': case '&': case '$': 366 case '#': case '<': case '>': case '%': 367 case '\"': case '.': case ':': 368 // This is one of \\ \@ \& \$ etc escape sequences. 369 TokenPtr++; 370 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 371 // This is the \:: escape sequence. 372 TokenPtr++; 373 } 374 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 375 formTokenWithChars(T, TokenPtr, tok::text); 376 T.setText(UnescapedText); 377 return; 378 } 379 380 // Don't make zero-length commands. 381 if (!isCommandNameStartCharacter(*TokenPtr)) { 382 formTextToken(T, TokenPtr); 383 return; 384 } 385 386 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 387 unsigned Length = TokenPtr - (BufferPtr + 1); 388 389 // Hardcoded support for lexing LaTeX formula commands 390 // \f$ \f( \f) \f[ \f] \f{ \f} as a single command. 391 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 392 C = *TokenPtr; 393 if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' || 394 C == '{' || C == '}') { 395 TokenPtr++; 396 Length++; 397 } 398 } 399 400 StringRef CommandName(BufferPtr + 1, Length); 401 402 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 403 if (!Info) { 404 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 405 StringRef CorrectedName = Info->Name; 406 SourceLocation Loc = getSourceLocation(BufferPtr); 407 SourceLocation EndLoc = getSourceLocation(TokenPtr); 408 SourceRange FullRange = SourceRange(Loc, EndLoc); 409 SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); 410 Diag(Loc, diag::warn_correct_comment_command_name) 411 << FullRange << CommandName << CorrectedName 412 << FixItHint::CreateReplacement(CommandRange, CorrectedName); 413 } else { 414 formTokenWithChars(T, TokenPtr, tok::unknown_command); 415 T.setUnknownCommandName(CommandName); 416 Diag(T.getLocation(), diag::warn_unknown_comment_command_name) 417 << SourceRange(T.getLocation(), T.getEndLocation()); 418 return; 419 } 420 } 421 if (Info->IsVerbatimBlockCommand) { 422 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 423 return; 424 } 425 if (Info->IsVerbatimLineCommand) { 426 setupAndLexVerbatimLine(T, TokenPtr, Info); 427 return; 428 } 429 formTokenWithChars(T, TokenPtr, CommandKind); 430 T.setCommandID(Info->getID()); 431 return; 432 } 433 434 case '&': 435 lexHTMLCharacterReference(T); 436 return; 437 438 case '<': { 439 TokenPtr++; 440 if (TokenPtr == CommentEnd) { 441 formTextToken(T, TokenPtr); 442 return; 443 } 444 const char C = *TokenPtr; 445 if (isHTMLIdentifierStartingCharacter(C)) 446 setupAndLexHTMLStartTag(T); 447 else if (C == '/') 448 setupAndLexHTMLEndTag(T); 449 else 450 formTextToken(T, TokenPtr); 451 return; 452 } 453 454 default: 455 return HandleNonCommandToken(); 456 } 457 } 458 459 void Lexer::setupAndLexVerbatimBlock(Token &T, 460 const char *TextBegin, 461 char Marker, const CommandInfo *Info) { 462 assert(Info->IsVerbatimBlockCommand); 463 464 VerbatimBlockEndCommandName.clear(); 465 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 466 VerbatimBlockEndCommandName.append(Info->EndCommandName); 467 468 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 469 T.setVerbatimBlockID(Info->getID()); 470 471 // If there is a newline following the verbatim opening command, skip the 472 // newline so that we don't create an tok::verbatim_block_line with empty 473 // text content. 474 if (BufferPtr != CommentEnd && 475 isVerticalWhitespace(*BufferPtr)) { 476 BufferPtr = skipNewline(BufferPtr, CommentEnd); 477 State = LS_VerbatimBlockBody; 478 return; 479 } 480 481 State = LS_VerbatimBlockFirstLine; 482 } 483 484 void Lexer::lexVerbatimBlockFirstLine(Token &T) { 485 again: 486 assert(BufferPtr < CommentEnd); 487 488 // FIXME: It would be better to scan the text once, finding either the block 489 // end command or newline. 490 // 491 // Extract current line. 492 const char *Newline = findNewline(BufferPtr, CommentEnd); 493 StringRef Line(BufferPtr, Newline - BufferPtr); 494 495 // Look for end command in current line. 496 size_t Pos = Line.find(VerbatimBlockEndCommandName); 497 const char *TextEnd; 498 const char *NextLine; 499 if (Pos == StringRef::npos) { 500 // Current line is completely verbatim. 501 TextEnd = Newline; 502 NextLine = skipNewline(Newline, CommentEnd); 503 } else if (Pos == 0) { 504 // Current line contains just an end command. 505 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 506 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 507 formTokenWithChars(T, End, tok::verbatim_block_end); 508 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 509 State = LS_Normal; 510 return; 511 } else { 512 // There is some text, followed by end command. Extract text first. 513 TextEnd = BufferPtr + Pos; 514 NextLine = TextEnd; 515 // If there is only whitespace before end command, skip whitespace. 516 if (isWhitespace(BufferPtr, TextEnd)) { 517 BufferPtr = TextEnd; 518 goto again; 519 } 520 } 521 522 StringRef Text(BufferPtr, TextEnd - BufferPtr); 523 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 524 T.setVerbatimBlockText(Text); 525 526 State = LS_VerbatimBlockBody; 527 } 528 529 void Lexer::lexVerbatimBlockBody(Token &T) { 530 assert(State == LS_VerbatimBlockBody); 531 532 if (CommentState == LCS_InsideCComment) 533 skipLineStartingDecorations(); 534 535 if (BufferPtr == CommentEnd) { 536 formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); 537 T.setVerbatimBlockText(""); 538 return; 539 } 540 541 lexVerbatimBlockFirstLine(T); 542 } 543 544 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 545 const CommandInfo *Info) { 546 assert(Info->IsVerbatimLineCommand); 547 formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 548 T.setVerbatimLineID(Info->getID()); 549 550 State = LS_VerbatimLineText; 551 } 552 553 void Lexer::lexVerbatimLineText(Token &T) { 554 assert(State == LS_VerbatimLineText); 555 556 // Extract current line. 557 const char *Newline = findNewline(BufferPtr, CommentEnd); 558 StringRef Text(BufferPtr, Newline - BufferPtr); 559 formTokenWithChars(T, Newline, tok::verbatim_line_text); 560 T.setVerbatimLineText(Text); 561 562 State = LS_Normal; 563 } 564 565 void Lexer::lexHTMLCharacterReference(Token &T) { 566 const char *TokenPtr = BufferPtr; 567 assert(*TokenPtr == '&'); 568 TokenPtr++; 569 if (TokenPtr == CommentEnd) { 570 formTextToken(T, TokenPtr); 571 return; 572 } 573 const char *NamePtr; 574 bool isNamed = false; 575 bool isDecimal = false; 576 char C = *TokenPtr; 577 if (isHTMLNamedCharacterReferenceCharacter(C)) { 578 NamePtr = TokenPtr; 579 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 580 isNamed = true; 581 } else if (C == '#') { 582 TokenPtr++; 583 if (TokenPtr == CommentEnd) { 584 formTextToken(T, TokenPtr); 585 return; 586 } 587 C = *TokenPtr; 588 if (isHTMLDecimalCharacterReferenceCharacter(C)) { 589 NamePtr = TokenPtr; 590 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 591 isDecimal = true; 592 } else if (C == 'x' || C == 'X') { 593 TokenPtr++; 594 NamePtr = TokenPtr; 595 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 596 } else { 597 formTextToken(T, TokenPtr); 598 return; 599 } 600 } else { 601 formTextToken(T, TokenPtr); 602 return; 603 } 604 if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 605 *TokenPtr != ';') { 606 formTextToken(T, TokenPtr); 607 return; 608 } 609 StringRef Name(NamePtr, TokenPtr - NamePtr); 610 TokenPtr++; // Skip semicolon. 611 StringRef Resolved; 612 if (isNamed) 613 Resolved = resolveHTMLNamedCharacterReference(Name); 614 else if (isDecimal) 615 Resolved = resolveHTMLDecimalCharacterReference(Name); 616 else 617 Resolved = resolveHTMLHexCharacterReference(Name); 618 619 if (Resolved.empty()) { 620 formTextToken(T, TokenPtr); 621 return; 622 } 623 formTokenWithChars(T, TokenPtr, tok::text); 624 T.setText(Resolved); 625 } 626 627 void Lexer::setupAndLexHTMLStartTag(Token &T) { 628 assert(BufferPtr[0] == '<' && 629 isHTMLIdentifierStartingCharacter(BufferPtr[1])); 630 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 631 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 632 if (!isHTMLTagName(Name)) { 633 formTextToken(T, TagNameEnd); 634 return; 635 } 636 637 formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 638 T.setHTMLTagStartName(Name); 639 640 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 641 642 const char C = *BufferPtr; 643 if (BufferPtr != CommentEnd && 644 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 645 State = LS_HTMLStartTag; 646 } 647 648 void Lexer::lexHTMLStartTag(Token &T) { 649 assert(State == LS_HTMLStartTag); 650 651 const char *TokenPtr = BufferPtr; 652 char C = *TokenPtr; 653 if (isHTMLIdentifierCharacter(C)) { 654 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 655 StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 656 formTokenWithChars(T, TokenPtr, tok::html_ident); 657 T.setHTMLIdent(Ident); 658 } else { 659 switch (C) { 660 case '=': 661 TokenPtr++; 662 formTokenWithChars(T, TokenPtr, tok::html_equals); 663 break; 664 case '\"': 665 case '\'': { 666 const char *OpenQuote = TokenPtr; 667 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 668 const char *ClosingQuote = TokenPtr; 669 if (TokenPtr != CommentEnd) // Skip closing quote. 670 TokenPtr++; 671 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 672 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 673 ClosingQuote - (OpenQuote + 1))); 674 break; 675 } 676 case '>': 677 TokenPtr++; 678 formTokenWithChars(T, TokenPtr, tok::html_greater); 679 State = LS_Normal; 680 return; 681 case '/': 682 TokenPtr++; 683 if (TokenPtr != CommentEnd && *TokenPtr == '>') { 684 TokenPtr++; 685 formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 686 } else 687 formTextToken(T, TokenPtr); 688 689 State = LS_Normal; 690 return; 691 } 692 } 693 694 // Now look ahead and return to normal state if we don't see any HTML tokens 695 // ahead. 696 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 697 if (BufferPtr == CommentEnd) { 698 State = LS_Normal; 699 return; 700 } 701 702 C = *BufferPtr; 703 if (!isHTMLIdentifierStartingCharacter(C) && 704 C != '=' && C != '\"' && C != '\'' && C != '>') { 705 State = LS_Normal; 706 return; 707 } 708 } 709 710 void Lexer::setupAndLexHTMLEndTag(Token &T) { 711 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 712 713 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 714 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 715 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 716 if (!isHTMLTagName(Name)) { 717 formTextToken(T, TagNameEnd); 718 return; 719 } 720 721 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 722 723 formTokenWithChars(T, End, tok::html_end_tag); 724 T.setHTMLTagEndName(Name); 725 726 if (BufferPtr != CommentEnd && *BufferPtr == '>') 727 State = LS_HTMLEndTag; 728 } 729 730 void Lexer::lexHTMLEndTag(Token &T) { 731 assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 732 733 formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 734 State = LS_Normal; 735 } 736 737 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 738 const CommandTraits &Traits, SourceLocation FileLoc, 739 const char *BufferStart, const char *BufferEnd, bool ParseCommands) 740 : Allocator(Allocator), Diags(Diags), Traits(Traits), 741 BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart), 742 FileLoc(FileLoc), ParseCommands(ParseCommands), 743 CommentState(LCS_BeforeComment), State(LS_Normal) {} 744 745 void Lexer::lex(Token &T) { 746 again: 747 switch (CommentState) { 748 case LCS_BeforeComment: 749 if (BufferPtr == BufferEnd) { 750 formTokenWithChars(T, BufferPtr, tok::eof); 751 return; 752 } 753 754 assert(*BufferPtr == '/'); 755 BufferPtr++; // Skip first slash. 756 switch(*BufferPtr) { 757 case '/': { // BCPL comment. 758 BufferPtr++; // Skip second slash. 759 760 if (BufferPtr != BufferEnd) { 761 // Skip Doxygen magic marker, if it is present. 762 // It might be missing because of a typo //< or /*<, or because we 763 // merged this non-Doxygen comment into a bunch of Doxygen comments 764 // around it: /** ... */ /* ... */ /** ... */ 765 const char C = *BufferPtr; 766 if (C == '/' || C == '!') 767 BufferPtr++; 768 } 769 770 // Skip less-than symbol that marks trailing comments. 771 // Skip it even if the comment is not a Doxygen one, because //< and /*< 772 // are frequent typos. 773 if (BufferPtr != BufferEnd && *BufferPtr == '<') 774 BufferPtr++; 775 776 CommentState = LCS_InsideBCPLComment; 777 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 778 State = LS_Normal; 779 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 780 goto again; 781 } 782 case '*': { // C comment. 783 BufferPtr++; // Skip star. 784 785 // Skip Doxygen magic marker. 786 const char C = *BufferPtr; 787 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 788 BufferPtr++; 789 790 // Skip less-than symbol that marks trailing comments. 791 if (BufferPtr != BufferEnd && *BufferPtr == '<') 792 BufferPtr++; 793 794 CommentState = LCS_InsideCComment; 795 State = LS_Normal; 796 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 797 goto again; 798 } 799 default: 800 llvm_unreachable("second character of comment should be '/' or '*'"); 801 } 802 803 case LCS_BetweenComments: { 804 // Consecutive comments are extracted only if there is only whitespace 805 // between them. So we can search for the start of the next comment. 806 const char *EndWhitespace = BufferPtr; 807 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 808 EndWhitespace++; 809 810 // Turn any whitespace between comments (and there is only whitespace 811 // between them -- guaranteed by comment extraction) into a newline. We 812 // have two newlines between C comments in total (first one was synthesized 813 // after a comment). 814 formTokenWithChars(T, EndWhitespace, tok::newline); 815 816 CommentState = LCS_BeforeComment; 817 break; 818 } 819 820 case LCS_InsideBCPLComment: 821 case LCS_InsideCComment: 822 if (BufferPtr != CommentEnd) { 823 lexCommentText(T); 824 break; 825 } else { 826 // Skip C comment closing sequence. 827 if (CommentState == LCS_InsideCComment) { 828 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 829 BufferPtr += 2; 830 assert(BufferPtr <= BufferEnd); 831 832 // Synthenize newline just after the C comment, regardless if there is 833 // actually a newline. 834 formTokenWithChars(T, BufferPtr, tok::newline); 835 836 CommentState = LCS_BetweenComments; 837 break; 838 } else { 839 // Don't synthesized a newline after BCPL comment. 840 CommentState = LCS_BetweenComments; 841 goto again; 842 } 843 } 844 } 845 } 846 847 StringRef Lexer::getSpelling(const Token &Tok, 848 const SourceManager &SourceMgr) const { 849 SourceLocation Loc = Tok.getLocation(); 850 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 851 852 bool InvalidTemp = false; 853 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 854 if (InvalidTemp) 855 return StringRef(); 856 857 const char *Begin = File.data() + LocInfo.second; 858 return StringRef(Begin, Tok.getLength()); 859 } 860 861 } // end namespace comments 862 } // end namespace clang 863