lib/AST/CommentLexer.cpp

//===--- CommentLexer.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/DiagnosticComment.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"

namespace clang {
namespace comments {

/// Print a one-line debugging description of this token to llvm::errs():
/// its kind, its source location, its length and its spelling in quotes.
void Token::dump(const Lexer &L, const SourceManager &SM) const {
  auto &OS = llvm::errs();

  OS << "comments::Token Kind=" << Kind << " ";
  Loc.print(OS, SM);
  OS << " " << Length;
  OS << " \"" << L.getSpelling(*this, SM) << "\"\n";
}

/// Returns true if \p C may be part of the name of a named HTML character
/// reference (the "amp" in "&amp;").  The decision is delegated to isLetter().
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}

/// Returns true if \p C may be part of the digits of a decimal HTML character
/// reference (the "38" in "&#38;").  The decision is delegated to isDigit().
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}

/// Returns true if \p C may be part of the digits of a hexadecimal HTML
/// character reference (the "26" in "&#x26;").  The decision is delegated to
/// isHexDigit().
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}

/// Encode \p CodePoint as UTF-8 into storage obtained from \p Allocator.
///
/// \returns a StringRef over the encoded bytes, or an empty StringRef when
/// llvm::ConvertCodePointToUTF8 reports failure.
static inline StringRef convertCodePointToUTF8(
                                      llvm::BumpPtrAllocator &Allocator,
                                      unsigned CodePoint) {
  // Reserve room for the longest possible encoding of a single code point.
  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);

  // The converter advances End past the bytes it wrote.
  char *End = Begin;
  const bool Converted = llvm::ConvertCodePointToUTF8(CodePoint, End);
  return Converted ? StringRef(Begin, End - Begin) : StringRef();
}

namespace {

#include "clang/AST/CommentHTMLTags.inc"
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"

} // end anonymous namespace

/// Translate the name of a named HTML character reference (without the
/// leading '&' and trailing ';') into the text it stands for.
///
/// \param Name reference name, e.g. "amp".
/// \returns the replacement text.  For names outside the fast path this is
/// whatever translateHTMLNamedCharacterReferenceToUTF8() returns; the caller
/// in this file treats an empty result as "unknown reference".
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path, first check a few most widely used named character references.
  //
  // The table lookup must not be passed as the argument of .Default(): a
  // function argument is evaluated before the call, which would run the slow
  // path on every lookup, including fast-path hits.
  const StringRef Common = llvm::StringSwitch<StringRef>(Name)
      .Case("amp", "&")
      .Case("lt", "<")
      .Case("gt", ">")
      .Case("quot", "\"")
      .Case("apos", "\'")
      .Default(StringRef());
  if (!Common.empty())
    return Common;

  // Slow path.
  return translateHTMLNamedCharacterReferenceToUTF8(Name);
}

/// Translate the digits of a decimal HTML character reference (the "38" in
/// "&#38;") into the UTF-8 encoding of that code point.
///
/// \param Name the decimal digits only; every character must satisfy
/// isHTMLDecimalCharacterReferenceCharacter().
/// \returns the UTF-8 text allocated from the lexer's allocator, or an empty
/// StringRef if the number does not denote an encodable code point.
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  // Largest value that can be a Unicode code point.  Anything above it cannot
  // be encoded, so stop accumulating as soon as it is exceeded.  Without this
  // guard the unsigned accumulator wraps around silently and an over-long
  // reference such as "&#4294967361;" would resolve to 'A'.
  const unsigned MaxCodePoint = 0x10FFFF;

  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
    // Cannot overflow: CodePoint <= MaxCodePoint here, and
    // MaxCodePoint * 10 + 9 fits comfortably in 32 bits.
    CodePoint = CodePoint * 10 + (Name[i] - '0');
    if (CodePoint > MaxCodePoint)
      return StringRef();
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}

/// Translate the digits of a hexadecimal HTML character reference (the "26"
/// in "&#x26;") into the UTF-8 encoding of that code point.
///
/// \param Name the hexadecimal digits only; every character must satisfy
/// isHTMLHexCharacterReferenceCharacter().
/// \returns the UTF-8 text allocated from the lexer's allocator, or an empty
/// StringRef if the number does not denote an encodable code point.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  // Largest value that can be a Unicode code point.  Anything above it cannot
  // be encoded, so stop accumulating as soon as it is exceeded.  Without this
  // guard the unsigned accumulator wraps around silently and an over-long
  // reference such as "&#x100000041;" would resolve to 'A'.
  const unsigned MaxCodePoint = 0x10FFFF;

  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    const char C = Name[i];
    assert(isHTMLHexCharacterReferenceCharacter(C));
    // Cannot overflow: CodePoint <= MaxCodePoint here, and
    // MaxCodePoint * 16 + 15 fits comfortably in 32 bits.
    CodePoint = CodePoint * 16 + llvm::hexDigitValue(C);
    if (CodePoint > MaxCodePoint)
      return StringRef();
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}

/// Inside a C comment, skip the decoration at the start of a line: optional
/// horizontal whitespace followed by a single '*'.  BufferPtr is left
/// untouched when the first non-whitespace character is not '*' or when the
/// end of the comment is reached first.
void Lexer::skipLineStartingDecorations() {
  // This function should be called only for C comments
  assert(CommentState == LCS_InsideCComment);

  for (const char *Cursor = BufferPtr; Cursor != CommentEnd; ++Cursor) {
    if (isHorizontalWhitespace(*Cursor))
      continue;

    // First non-whitespace character decides: only a star is a decoration.
    if (*Cursor == '*')
      BufferPtr = Cursor + 1;
    return;
  }
}

namespace {
/// Locate the first vertical-whitespace (newline) character in
/// [BufferPtr, BufferEnd).
///
/// \returns a pointer to that character, or \p BufferEnd if there is none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  while (BufferPtr != BufferEnd && !isVerticalWhitespace(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Skip one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \pre Unless the range is empty, \p BufferPtr points at '\n' or '\r'.
/// \returns a pointer just past the terminator, or \p BufferPtr unchanged if
/// the range is empty.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  // Nothing to skip in an empty range.
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  // Otherwise this must be a carriage return, optionally followed by a
  // line feed that belongs to the same terminator.
  assert(First == '\r');
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by isHTMLNamedCharacterReferenceCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipNamedCharacterReference(const char *BufferPtr,
                                        const char *BufferEnd) {
  while (BufferPtr != BufferEnd &&
         isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by isHTMLDecimalCharacterReferenceCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipDecimalCharacterReference(const char *BufferPtr,
                                          const char *BufferEnd) {
  while (BufferPtr != BufferEnd &&
         isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by isHTMLHexCharacterReferenceCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipHexCharacterReference(const char *BufferPtr,
                                      const char *BufferEnd) {
  while (BufferPtr != BufferEnd &&
         isHTMLHexCharacterReferenceCharacter(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Returns true if \p C may begin an HTML identifier (tag or attribute name).
/// The decision is delegated to isLetter().
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}

/// Returns true if \p C may appear inside an HTML identifier (tag or
/// attribute name).  The decision is delegated to isAlphanumeric().
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by isHTMLIdentifierCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  while (BufferPtr != BufferEnd && isHTMLIdentifierCharacter(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Skip an HTML string delimited by single or double quotes.  A quote that is
/// immediately preceded by a backslash does not terminate the string.
///
/// \pre \p BufferPtr points at the opening quote character.
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated inside the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Start right after the opening quote and stop at the first matching quote
  // that is not escaped by the character before it.
  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Quote && *(Cursor - 1) != '\\')
      break;
    ++Cursor;
  }
  return Cursor;
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by the single-character isWhitespace().
///
/// \returns a pointer to the first non-whitespace character, or \p BufferEnd.
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  while (BufferPtr != BufferEnd && isWhitespace(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by isHorizontalWhitespace().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipHorizontalWhitespace(const char *BufferPtr,
                                     const char *BufferEnd) {
  while (BufferPtr != BufferEnd && isHorizontalWhitespace(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Returns true if every character in [BufferPtr, BufferEnd) is whitespace,
/// i.e. skipWhitespace() runs all the way to \p BufferEnd.  An empty range
/// therefore counts as whitespace.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}

/// Returns true if \p C may begin a documentation command name (the
/// character right after '\\' or '@').  The decision is delegated to
/// isLetter().
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}

/// Returns true if \p C may appear inside a documentation command name.  The
/// decision is delegated to isAlphanumeric().
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}

/// Advance past the longest prefix of [BufferPtr, BufferEnd) made of
/// characters accepted by isCommandNameCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  while (BufferPtr != BufferEnd && isCommandNameCharacter(*BufferPtr))
    ++BufferPtr;
  return BufferPtr;
}

/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with backslash or trigraph for backslash.
///
/// \param BufferPtr start of the text to scan.
/// \param BufferEnd end of the buffer; returned when no unescaped newline is
/// found before it.
/// \returns a pointer to the first newline that is not escaped, or
/// \p BufferEnd.
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *CurPtr = BufferPtr;
  while (CurPtr != BufferEnd) {
    // Advance to the next newline character, if any.
    while (!isVerticalWhitespace(*CurPtr)) {
      CurPtr++;
      if (CurPtr == BufferEnd)
        return BufferEnd;
    }
    // We found a newline, check if it is escaped.
    // Walk backwards over trailing horizontal whitespace to reach the last
    // significant character of the line.
    // NOTE(review): this reads characters before CurPtr and possibly before
    // BufferPtr; presumably safe because the caller in lex() passes a pointer
    // that has already advanced past the "//" introducer -- confirm.
    const char *EscapePtr = CurPtr - 1;
    while(isHorizontalWhitespace(*EscapePtr))
      EscapePtr--;

    // The newline is escaped by a backslash, or by the trigraph "??/" as long
    // as all three trigraph characters lie inside the scanned range.
    if (*EscapePtr == '\\' ||
        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
      // We found an escaped newline.
      CurPtr = skipNewline(CurPtr, BufferEnd);
    } else
      return CurPtr; // Not an escaped newline.
  }
  return BufferEnd;
}

/// Return the one past end pointer for C comments, i.e. a pointer to the '*'
/// of the first "*/" found in [BufferPtr, BufferEnd).
/// Very dumb, does not handle escaped newlines or trigraphs.
///
/// \pre The range contains a closing "*/"; reaching \p BufferEnd without
/// seeing one is treated as unreachable.
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd) {
    if (*Cursor == '*') {
      // A star can never be the last character of a terminated C comment.
      assert(Cursor + 1 != BufferEnd);
      if (Cursor[1] == '/')
        return Cursor;
    }
    ++Cursor;
  }
  llvm_unreachable("buffer end hit before '*/' was seen");
}

} // end anonymous namespace

/// Fill \p Result with a token of kind \p Kind that spans
/// [BufferPtr, TokEnd), then advance BufferPtr to \p TokEnd.
///
/// In builds with assertions enabled the token's payload fields are set to
/// recognisable placeholder values.
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
                               tok::TokenKind Kind) {
  // Location and length are both derived from the current BufferPtr, so they
  // must be computed before the pointer is advanced below.
  Result.setKind(Kind);
  Result.setLocation(getSourceLocation(BufferPtr));
  Result.setLength(static_cast<unsigned>(TokEnd - BufferPtr));
#ifndef NDEBUG
  Result.TextPtr = "<UNSET>";
  Result.IntVal = 7;
#endif
  BufferPtr = TokEnd;
}

const char *Lexer::skipTextToken() {
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";

again:
  size_t End =
      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
  if (End == StringRef::npos)
    return CommentEnd;

  // Doxygen doesn't recognize any commands in a one-line double quotation.
  // If we don't find an ending quotation mark, we pretend it never began.
  if (*(TokenPtr + End) == '\"') {
    TokenPtr += End + 1;
    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
    if (End != StringRef::npos && *(TokenPtr + End) == '\"')
      TokenPtr += End + 1;
    goto again;
  }
  return TokenPtr + End;
}

/// Lex one token from the body of the current comment, starting at BufferPtr.
///
/// When commands are not being parsed only text and newline tokens are
/// produced.  Otherwise the work is dispatched on the lexer State: the
/// verbatim and HTML states are forwarded to their dedicated routines, and in
/// the normal state the first character selects between a command ('\\' or
/// '@'), an HTML character reference ('&'), an HTML tag ('<') and plain
/// text / newline.
///
/// \pre BufferPtr has not reached CommentEnd (asserted below).
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In a C comment the next line may start with a " * " decoration
          // that is not part of the comment text.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default:
        return formTextToken(T, skipTextToken());
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Every state other than LS_Normal has its own lexing routine.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A lone marker at the very end of the comment is plain text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token covers the marker too, but its text is only the escaped
        // character(s) that follow it.
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      // Length of the command name, not counting the marker character.
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
            C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        // Unknown name: either a typo for a known command (warn, offer a
        // fix-it and continue with the corrected command) or a genuinely
        // unknown command (emit an unknown_command token and warn).
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          // The replacement range excludes the marker character.
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      // Verbatim commands switch the lexer into a dedicated state.
      if (Info->IsVerbatimBlockCommand) {
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      // A lone '<' at the very end of the comment is plain text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      // "<x" may start an opening tag, "</" a closing tag; anything else
      // leaves the '<' as plain text.
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}

/// Emit the verbatim_block_begin token for a verbatim block command and put
/// the lexer into the matching verbatim state.
///
/// \param T token to fill.
/// \param TextBegin first character after the command name.
/// \param Marker the character that introduced the command; the end command
/// is expected to be spelled with the same marker.
/// \param Info description of the verbatim block command.
void Lexer::setupAndLexVerbatimBlock(Token &T,
                                     const char *TextBegin,
                                     char Marker, const CommandInfo *Info) {
  assert(Info->IsVerbatimBlockCommand);

  // Remember the full spelling of the end command, marker included.
  VerbatimBlockEndCommandName.clear();
  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  VerbatimBlockEndCommandName.append(Info->EndCommandName);

  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  T.setVerbatimBlockID(Info->getID());

  const bool NewlineFollows =
      BufferPtr != CommentEnd && isVerticalWhitespace(*BufferPtr);
  if (!NewlineFollows) {
    State = LS_VerbatimBlockFirstLine;
    return;
  }

  // If there is a newline following the verbatim opening command, skip the
  // newline so that we don't create an tok::verbatim_block_line with empty
  // text content.
  BufferPtr = skipNewline(BufferPtr, CommentEnd);
  State = LS_VerbatimBlockBody;
}

/// Lex one token from a line of a verbatim block: either the text that
/// precedes the end command (or the whole line when there is none), or the
/// end command itself.
///
/// Whitespace-only text in front of the end command is dropped rather than
/// turned into a token.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  const char *TextEnd = nullptr;
  const char *NextLine = nullptr;

  // FIXME: It would be better to scan the text once, finding either the block
  // end command or newline.
  while (true) {
    assert(BufferPtr < CommentEnd);

    // Extract current line and look for the end command in it.
    const char *LineEnd = findNewline(BufferPtr, CommentEnd);
    StringRef CurrentLine(BufferPtr, LineEnd - BufferPtr);
    const size_t EndCmdPos = CurrentLine.find(VerbatimBlockEndCommandName);

    if (EndCmdPos == 0) {
      // The end command is right here; the block is finished.
      const char *CmdEnd = BufferPtr + VerbatimBlockEndCommandName.size();
      StringRef CmdName(BufferPtr + 1, CmdEnd - (BufferPtr + 1));
      formTokenWithChars(T, CmdEnd, tok::verbatim_block_end);
      T.setVerbatimBlockID(Traits.getCommandInfo(CmdName)->getID());
      State = LS_Normal;
      return;
    }

    if (EndCmdPos == StringRef::npos) {
      // Current line is completely verbatim.
      TextEnd = LineEnd;
      NextLine = skipNewline(LineEnd, CommentEnd);
      break;
    }

    // There is some text, followed by end command.  Extract text first.
    TextEnd = BufferPtr + EndCmdPos;
    NextLine = TextEnd;
    if (!isWhitespace(BufferPtr, TextEnd))
      break;

    // Only whitespace precedes the end command: skip it and lex the end
    // command on the next pass through the loop.
    BufferPtr = TextEnd;
  }

  StringRef Text(BufferPtr, TextEnd - BufferPtr);
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  T.setVerbatimBlockText(Text);

  State = LS_VerbatimBlockBody;
}

/// Lex one token from a line inside a verbatim block, after the first line.
///
/// In a C comment the line-starting decoration is skipped first.  If the
/// comment ends here an empty verbatim_block_line token is produced;
/// otherwise the line is handled like the first line of the block.
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr != CommentEnd) {
    lexVerbatimBlockFirstLine(T);
    return;
  }

  // Nothing left in this comment: emit an empty line.
  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  T.setVerbatimBlockText("");
}

/// Emit the verbatim_line_name token for a verbatim line command and arrange
/// for the next token to be lexed as the verbatim text of that line.
///
/// \param T token to fill.
/// \param TextBegin first character after the command name.
/// \param Info description of the verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);

  // The rest of the line is lexed by lexVerbatimLineText().
  State = LS_VerbatimLineText;

  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());
}

/// Lex the remainder of the current line as a single verbatim_line_text
/// token and return to the normal state.  The terminating newline, if any,
/// is not part of the token.
void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // The text runs from the current position up to the next newline.
  const char *const LineBegin = BufferPtr;
  const char *const LineEnd = findNewline(LineBegin, CommentEnd);

  formTokenWithChars(T, LineEnd, tok::verbatim_line_text);
  T.setVerbatimLineText(StringRef(LineBegin, LineEnd - LineBegin));

  State = LS_Normal;
}

/// Lex an HTML character reference starting at the '&' under BufferPtr.
///
/// Three forms are recognised: named ("&amp;"), decimal ("&#38;") and
/// hexadecimal ("&#x26;").  A well-formed reference that resolves to
/// non-empty text becomes a text token carrying the resolved text.  In every
/// other case the characters consumed so far become an ordinary text token.
void Lexer::lexHTMLCharacterReference(Token &T) {
  enum ReferenceKind { RK_Named, RK_Decimal, RK_Hex };

  const char *Cursor = BufferPtr;
  assert(*Cursor == '&');
  ++Cursor;

  // Fallback used whenever the input turns out not to be a usable reference:
  // everything up to the current cursor is emitted as plain text.
  auto EmitAsText = [&]() -> void { formTextToken(T, Cursor); };

  if (Cursor == CommentEnd)
    return EmitAsText();

  const char *NameBegin = nullptr;
  ReferenceKind RefKind = RK_Named;
  if (isHTMLNamedCharacterReferenceCharacter(*Cursor)) {
    RefKind = RK_Named;
    NameBegin = Cursor;
    Cursor = skipNamedCharacterReference(Cursor, CommentEnd);
  } else if (*Cursor == '#') {
    ++Cursor;
    if (Cursor == CommentEnd)
      return EmitAsText();

    const char Next = *Cursor;
    if (isHTMLDecimalCharacterReferenceCharacter(Next)) {
      RefKind = RK_Decimal;
      NameBegin = Cursor;
      Cursor = skipDecimalCharacterReference(Cursor, CommentEnd);
    } else if (Next == 'x' || Next == 'X') {
      RefKind = RK_Hex;
      ++Cursor; // Step over the radix marker.
      NameBegin = Cursor;
      Cursor = skipHexCharacterReference(Cursor, CommentEnd);
    } else {
      return EmitAsText();
    }
  } else {
    return EmitAsText();
  }

  // A reference needs at least one name character and a closing semicolon.
  const bool Terminated = Cursor != CommentEnd && *Cursor == ';';
  if (NameBegin == Cursor || !Terminated)
    return EmitAsText();

  const StringRef Name(NameBegin, Cursor - NameBegin);
  ++Cursor; // Skip semicolon.

  StringRef Resolved;
  switch (RefKind) {
  case RK_Named:
    Resolved = resolveHTMLNamedCharacterReference(Name);
    break;
  case RK_Decimal:
    Resolved = resolveHTMLDecimalCharacterReference(Name);
    break;
  case RK_Hex:
    Resolved = resolveHTMLHexCharacterReference(Name);
    break;
  }

  // An empty result means the reference could not be resolved.
  if (Resolved.empty())
    return EmitAsText();

  formTokenWithChars(T, Cursor, tok::text);
  T.setText(Resolved);
}

/// Lex the "<name" part of an HTML opening tag.
///
/// If the name is not a known HTML tag, the characters are emitted as plain
/// text.  Otherwise an html_start_tag token is produced and, when what
/// follows looks like the continuation of the tag, the lexer enters the
/// LS_HTMLStartTag state.
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));

  // The first name character was already validated by the assertion, so the
  // scan for the rest of the identifier starts one character later.
  const char *const NameBegin = BufferPtr + 1;
  const char *const NameEnd = skipHTMLIdentifier(NameBegin + 1, CommentEnd);
  const StringRef TagName(NameBegin, NameEnd - NameBegin);
  if (!isHTMLTagName(TagName)) {
    formTextToken(T, NameEnd);
    return;
  }

  formTokenWithChars(T, NameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(TagName);

  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) { // in BCPL comments
    State = LS_HTMLStartTag;
    return;
  }

  // Stay in tag mode only if the next character can continue the tag.
  const char Next = *BufferPtr;
  const bool ContinuesTag = Next == '>' || Next == '/' ||
                            isVerticalWhitespace(Next) ||
                            isHTMLIdentifierStartingCharacter(Next);
  if (ContinuesTag)
    State = LS_HTMLStartTag;
}

/// Lex one token from the inside of an HTML opening tag: an attribute
/// identifier, '=', a quoted string, '>' or "/>".
///
/// Newlines (and, in C comments, line-starting decorations) in front of the
/// token are skipped.  After '>' or '/' the lexer returns to LS_Normal; after
/// any other token it looks ahead and returns to LS_Normal unless the next
/// character could continue the tag.
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  // Skip leading whitespace and comment decorations
  while (isVerticalWhitespace(*BufferPtr)) {
    BufferPtr = skipNewline(BufferPtr, CommentEnd);

    if (CommentState == LCS_InsideCComment)
      skipLineStartingDecorations();

    BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
    if (BufferPtr == CommentEnd) {
      // HTML starting tags must be defined in a single comment block.
      // It's likely a user-error where they forgot to terminate the comment.
      State = LS_Normal;
      // Since at least one newline was skipped and one token needs to be lexed,
      // return a newline.
      formTokenWithChars(T, BufferPtr, tok::newline);
      return;
    }
  }

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    // Attribute name.
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    // NOTE(review): there is no default case; if C is none of the characters
    // below, no token is formed by this switch.  Confirm that the checks made
    // before entering LS_HTMLStartTag rule this out, including after the
    // newline skipping above.
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      // The token spans both quotes, its text only what lies between them.
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      // End of the opening tag.
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      // Either the self-closing "/>" or a stray slash, which becomes text.
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        formTextToken(T, TokenPtr);

      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    // Stay in LS_HTMLStartTag: nothing more can be inspected in this comment.
    return;
  }

  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
    State = LS_Normal;
    return;
  }
}

/// Lex the "</name" part of an HTML closing tag.
///
/// Whitespace is allowed between "</" and the name and after the name.  If
/// the name is not a known HTML tag, the characters are emitted as plain
/// text.  Otherwise an html_end_tag token is produced and, if a '>' follows
/// immediately, the lexer enters the LS_HTMLEndTag state to lex it next.
void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *const NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *const NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);
  const StringRef TagName(NameBegin, NameEnd - NameBegin);
  if (!isHTMLTagName(TagName)) {
    formTextToken(T, NameEnd);
    return;
  }

  // The token also swallows any whitespace that trails the tag name.
  formTokenWithChars(T, skipWhitespace(NameEnd, CommentEnd),
                     tok::html_end_tag);
  T.setHTMLTagEndName(TagName);

  const bool ClosedHere = BufferPtr != CommentEnd && *BufferPtr == '>';
  if (ClosedHere)
    State = LS_HTMLEndTag;
}

/// Lex the '>' that terminates an HTML closing tag and return to the normal
/// state.
///
/// \pre BufferPtr points at '>' inside the comment.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  const char *const AfterGreater = BufferPtr + 1;
  formTokenWithChars(T, AfterGreater, tok::html_greater);
  State = LS_Normal;
}

/// Create a lexer over the comment text in [BufferStart, BufferEnd).
///
/// The lexer starts at \p BufferStart in the LCS_BeforeComment /
/// LS_Normal state.  CommentEnd is not initialized here; it is assigned by
/// lex() when the start of a comment is recognised.
///
/// \param Allocator used for text produced while resolving HTML numeric
/// character references.
/// \param Diags, Traits, FileLoc, ParseCommands stored as given.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}

/// Lex the next token from the buffer into \p T.
///
/// The buffer holds one or more comments separated only by whitespace.  This
/// routine is a state machine over CommentState:
///  - LCS_BeforeComment: skip the comment introducer ("//" or "/*") and any
///    Doxygen marker, compute CommentEnd, then lex the comment body; at the
///    end of the buffer an eof token is produced.
///  - LCS_InsideBCPLComment / LCS_InsideCComment: lex the body via
///    lexCommentText() until CommentEnd is reached.
///  - LCS_BetweenComments: turn the whitespace between two comments into a
///    newline token.
/// Transitions that do not produce a token jump back to the top.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // Verbatim-block and HTML-start-tag states carry over from the
      // previous comment line; every other state is reset.
      switch (State) {
      case LS_VerbatimBlockFirstLine:
      case LS_VerbatimBlockBody:
        break;
      case LS_HTMLStartTag:
        BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
        break;
      default:
        State = LS_Normal;
        break;
      }
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A second '*' counts as a marker only if it does not start the
      // closing "*/" of an empty comment.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // When lexing the start of an HTML tag (i.e. going through the attributes)
    // there won't be any newlines generated.
    if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
      CommentState = LCS_BeforeComment;
      BufferPtr = EndWhitespace;
      goto again;
    }

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // When lexing the start of an HTML tag (i.e. going through the
        // attributes) there won't be any newlines generated - whitespace still
        // needs to be skipped.
        if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
          CommentState = LCS_BetweenComments;
          goto again;
        }

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}

/// Return the source text of \p Tok as it appears in the file buffer.
///
/// \param Tok token whose location and length select the text.
/// \param SourceMgr source manager that owns the buffer.
/// \returns the spelling, or an empty StringRef if the buffer data could not
/// be obtained.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  // Split the token location into (file, offset within file).
  const FileIDAndOffset Decomposed =
      SourceMgr.getDecomposedLoc(Tok.getLocation());

  bool Invalid = false;
  const StringRef Buffer = SourceMgr.getBufferData(Decomposed.first, &Invalid);
  if (Invalid)
    return StringRef();

  return StringRef(Buffer.data() + Decomposed.second, Tok.getLength());
}

} // end namespace comments
} // end namespace clang