lib/AST/CommentParser.cpp

//===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang/AST/CommentParser.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/AST/CommentDiagnostic.h"
#include "clang/AST/CommentSema.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/Support/ErrorHandling.h"

namespace clang {

static inline bool isWhitespace(llvm::StringRef S) {
  for (StringRef::const_iterator I = S.begin(), E = S.end(); I != E; ++I) {
    if (!isWhitespace(*I))
      return false;
  }
  return true;
}

namespace comments {

/// Re-lexes a sequence of tok::text tokens.
class TextTokenRetokenizer {
  llvm::BumpPtrAllocator &Allocator;
  Parser &P;

  /// This flag is set when there are no more tokens we can fetch from lexer.
  bool NoMoreInterestingTokens;

  /// Token buffer: tokens we have processed and lookahead.
  SmallVector<Token, 16> Toks;

  /// A position in \c Toks.
  struct Position {
    const char *BufferStart;
    const char *BufferEnd;
    const char *BufferPtr;
    SourceLocation BufferStartLoc;
    unsigned CurToken;
  };

  /// Current position in Toks.
  Position Pos;

  bool isEnd() const {
    return Pos.CurToken >= Toks.size();
  }

  /// Sets up the buffer pointers to point to current token.
  void setupBuffer() {
    assert(!isEnd());
    const Token &Tok = Toks[Pos.CurToken];

    Pos.BufferStart = Tok.getText().begin();
    Pos.BufferEnd = Tok.getText().end();
    Pos.BufferPtr = Pos.BufferStart;
    Pos.BufferStartLoc = Tok.getLocation();
  }

  SourceLocation getSourceLocation() const {
    const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
    return Pos.BufferStartLoc.getLocWithOffset(CharNo);
  }

  char peek() const {
    assert(!isEnd());
    assert(Pos.BufferPtr != Pos.BufferEnd);
    return *Pos.BufferPtr;
  }

  void consumeChar() {
    assert(!isEnd());
    assert(Pos.BufferPtr != Pos.BufferEnd);
    Pos.BufferPtr++;
    if (Pos.BufferPtr == Pos.BufferEnd) {
      Pos.CurToken++;
      if (isEnd() && !addToken())
        return;

      assert(!isEnd());
      setupBuffer();
    }
  }

  /// Add a token.
  /// Returns true on success, false if there are no interesting tokens to
  /// fetch from lexer.
  bool addToken() {
    if (NoMoreInterestingTokens)
      return false;

    if (P.Tok.is(tok::newline)) {
      // If we see a single newline token between text tokens, skip it.
      Token Newline = P.Tok;
      P.consumeToken();
      if (P.Tok.isNot(tok::text)) {
        P.putBack(Newline);
        NoMoreInterestingTokens = true;
        return false;
      }
    }
    if (P.Tok.isNot(tok::text)) {
      NoMoreInterestingTokens = true;
      return false;
    }

    Toks.push_back(P.Tok);
    P.consumeToken();
    if (Toks.size() == 1)
      setupBuffer();
    return true;
  }

  void consumeWhitespace() {
    while (!isEnd()) {
      if (isWhitespace(peek()))
        consumeChar();
      else
        break;
    }
  }

  void formTokenWithChars(Token &Result,
                          SourceLocation Loc,
                          const char *TokBegin,
                          unsigned TokLength,
                          StringRef Text) {
    Result.setLocation(Loc);
    Result.setKind(tok::text);
    Result.setLength(TokLength);
#ifndef NDEBUG
    Result.TextPtr = "<UNSET>";
    Result.IntVal = 7;
#endif
    Result.setText(Text);
  }

public:
  TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator, Parser &P):
      Allocator(Allocator), P(P), NoMoreInterestingTokens(false) {
    Pos.CurToken = 0;
    addToken();
  }

  /// Extract a word -- sequence of non-whitespace characters.
  bool lexWord(Token &Tok) {
    if (isEnd())
      return false;

    Position SavedPos = Pos;

    consumeWhitespace();
    SmallString<32> WordText;
    const char *WordBegin = Pos.BufferPtr;
    SourceLocation Loc = getSourceLocation();
    while (!isEnd()) {
      const char C = peek();
      if (!isWhitespace(C)) {
        WordText.push_back(C);
        consumeChar();
      } else
        break;
    }
    const unsigned Length = WordText.size();
    if (Length == 0) {
      Pos = SavedPos;
      return false;
    }

    char *TextPtr = Allocator.Allocate<char>(Length + 1);

    memcpy(TextPtr, WordText.c_str(), Length + 1);
    StringRef Text = StringRef(TextPtr, Length);

    formTokenWithChars(Tok, Loc, WordBegin, Length, Text);
    return true;
  }

  bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
    if (isEnd())
      return false;

    Position SavedPos = Pos;

    consumeWhitespace();
    SmallString<32> WordText;
    const char *WordBegin = Pos.BufferPtr;
    SourceLocation Loc = getSourceLocation();
    bool Error = false;
    if (!isEnd()) {
      const char C = peek();
      if (C == OpenDelim) {
        WordText.push_back(C);
        consumeChar();
      } else
        Error = true;
    }
    char C = '\0';
    while (!Error && !isEnd()) {
      C = peek();
      WordText.push_back(C);
      consumeChar();
      if (C == CloseDelim)
        break;
    }
    if (!Error && C != CloseDelim)
      Error = true;

    if (Error) {
      Pos = SavedPos;
      return false;
    }

    const unsigned Length = WordText.size();
    char *TextPtr = Allocator.Allocate<char>(Length + 1);

    memcpy(TextPtr, WordText.c_str(), Length + 1);
    StringRef Text = StringRef(TextPtr, Length);

    formTokenWithChars(Tok, Loc, WordBegin,
                       Pos.BufferPtr - WordBegin, Text);
    return true;
  }

  /// Put back tokens that we didn't consume.
  void putBackLeftoverTokens() {
    if (isEnd())
      return;

    bool HavePartialTok = false;
    Token PartialTok;
    if (Pos.BufferPtr != Pos.BufferStart) {
      formTokenWithChars(PartialTok, getSourceLocation(),
                         Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
                         StringRef(Pos.BufferPtr,
                                   Pos.BufferEnd - Pos.BufferPtr));
      HavePartialTok = true;
      Pos.CurToken++;
    }

    P.putBack(llvm::ArrayRef(Toks.begin() + Pos.CurToken, Toks.end()));
    Pos.CurToken = Toks.size();

    if (HavePartialTok)
      P.putBack(PartialTok);
  }
};

Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator,
               const SourceManager &SourceMgr, DiagnosticsEngine &Diags,
               const CommandTraits &Traits):
    L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags),
    Traits(Traits) {
  consumeToken();
}

void Parser::parseParamCommandArgs(ParamCommandComment *PC,
                                   TextTokenRetokenizer &Retokenizer) {
  Token Arg;
  // Check if argument looks like direction specification: [dir]
  // e.g., [in], [out], [in,out]
  if (Retokenizer.lexDelimitedSeq(Arg, '[', ']'))
    S.actOnParamCommandDirectionArg(PC,
                                    Arg.getLocation(),
                                    Arg.getEndLocation(),
                                    Arg.getText());

  if (Retokenizer.lexWord(Arg))
    S.actOnParamCommandParamNameArg(PC,
                                    Arg.getLocation(),
                                    Arg.getEndLocation(),
                                    Arg.getText());
}

void Parser::parseTParamCommandArgs(TParamCommandComment *TPC,
                                    TextTokenRetokenizer &Retokenizer) {
  Token Arg;
  if (Retokenizer.lexWord(Arg))
    S.actOnTParamCommandParamNameArg(TPC,
                                     Arg.getLocation(),
                                     Arg.getEndLocation(),
                                     Arg.getText());
}

ArrayRef<Comment::Argument>
Parser::parseCommandArgs(TextTokenRetokenizer &Retokenizer, unsigned NumArgs) {
  auto *Args = new (Allocator.Allocate<Comment::Argument>(NumArgs))
      Comment::Argument[NumArgs];
  unsigned ParsedArgs = 0;
  Token Arg;
  while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) {
    Args[ParsedArgs] = Comment::Argument{
        SourceRange(Arg.getLocation(), Arg.getEndLocation()), Arg.getText()};
    ParsedArgs++;
  }

  return llvm::ArrayRef(Args, ParsedArgs);
}

BlockCommandComment *Parser::parseBlockCommand() {
  assert(Tok.is(tok::backslash_command) || Tok.is(tok::at_command));

  ParamCommandComment *PC = nullptr;
  TParamCommandComment *TPC = nullptr;
  BlockCommandComment *BC = nullptr;
  const CommandInfo *Info = Traits.getCommandInfo(Tok.getCommandID());
  CommandMarkerKind CommandMarker =
      Tok.is(tok::backslash_command) ? CMK_Backslash : CMK_At;
  if (Info->IsParamCommand) {
    PC = S.actOnParamCommandStart(Tok.getLocation(),
                                  Tok.getEndLocation(),
                                  Tok.getCommandID(),
                                  CommandMarker);
  } else if (Info->IsTParamCommand) {
    TPC = S.actOnTParamCommandStart(Tok.getLocation(),
                                    Tok.getEndLocation(),
                                    Tok.getCommandID(),
                                    CommandMarker);
  } else {
    BC = S.actOnBlockCommandStart(Tok.getLocation(),
                                  Tok.getEndLocation(),
                                  Tok.getCommandID(),
                                  CommandMarker);
  }
  consumeToken();

  if (isTokBlockCommand()) {
    // Block command ahead.  We can't nest block commands, so pretend that this
    // command has an empty argument.
    ParagraphComment *Paragraph = S.actOnParagraphComment(std::nullopt);
    if (PC) {
      S.actOnParamCommandFinish(PC, Paragraph);
      return PC;
    } else if (TPC) {
      S.actOnTParamCommandFinish(TPC, Paragraph);
      return TPC;
    } else {
      S.actOnBlockCommandFinish(BC, Paragraph);
      return BC;
    }
  }

  if (PC || TPC || Info->NumArgs > 0) {
    // In order to parse command arguments we need to retokenize a few
    // following text tokens.
    TextTokenRetokenizer Retokenizer(Allocator, *this);

    if (PC)
      parseParamCommandArgs(PC, Retokenizer);
    else if (TPC)
      parseTParamCommandArgs(TPC, Retokenizer);
    else
      S.actOnBlockCommandArgs(BC, parseCommandArgs(Retokenizer, Info->NumArgs));

    Retokenizer.putBackLeftoverTokens();
  }

  // If there's a block command ahead, we will attach an empty paragraph to
  // this command.
  bool EmptyParagraph = false;
  if (isTokBlockCommand())
    EmptyParagraph = true;
  else if (Tok.is(tok::newline)) {
    Token PrevTok = Tok;
    consumeToken();
    EmptyParagraph = isTokBlockCommand();
    putBack(PrevTok);
  }

  ParagraphComment *Paragraph;
  if (EmptyParagraph)
    Paragraph = S.actOnParagraphComment(std::nullopt);
  else {
    BlockContentComment *Block = parseParagraphOrBlockCommand();
    // Since we have checked for a block command, we should have parsed a
    // paragraph.
    Paragraph = cast<ParagraphComment>(Block);
  }

  if (PC) {
    S.actOnParamCommandFinish(PC, Paragraph);
    return PC;
  } else if (TPC) {
    S.actOnTParamCommandFinish(TPC, Paragraph);
    return TPC;
  } else {
    S.actOnBlockCommandFinish(BC, Paragraph);
    return BC;
  }
}

InlineCommandComment *Parser::parseInlineCommand() {
  assert(Tok.is(tok::backslash_command) || Tok.is(tok::at_command));
  const CommandInfo *Info = Traits.getCommandInfo(Tok.getCommandID());

  const Token CommandTok = Tok;
  consumeToken();

  TextTokenRetokenizer Retokenizer(Allocator, *this);
  ArrayRef<Comment::Argument> Args =
      parseCommandArgs(Retokenizer, Info->NumArgs);

  InlineCommandComment *IC = S.actOnInlineCommand(
      CommandTok.getLocation(), CommandTok.getEndLocation(),
      CommandTok.getCommandID(), Args);

  if (Args.size() < Info->NumArgs) {
    Diag(CommandTok.getEndLocation().getLocWithOffset(1),
         diag::warn_doc_inline_command_not_enough_arguments)
        << CommandTok.is(tok::at_command) << Info->Name << Args.size()
        << Info->NumArgs
        << SourceRange(CommandTok.getLocation(), CommandTok.getEndLocation());
  }

  Retokenizer.putBackLeftoverTokens();

  return IC;
}

HTMLStartTagComment *Parser::parseHTMLStartTag() {
  assert(Tok.is(tok::html_start_tag));
  HTMLStartTagComment *HST =
      S.actOnHTMLStartTagStart(Tok.getLocation(),
                               Tok.getHTMLTagStartName());
  consumeToken();

  SmallVector<HTMLStartTagComment::Attribute, 2> Attrs;
  while (true) {
    switch (Tok.getKind()) {
    case tok::html_ident: {
      Token Ident = Tok;
      consumeToken();
      if (Tok.isNot(tok::html_equals)) {
        Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
                                                       Ident.getHTMLIdent()));
        continue;
      }
      Token Equals = Tok;
      consumeToken();
      if (Tok.isNot(tok::html_quoted_string)) {
        Diag(Tok.getLocation(),
             diag::warn_doc_html_start_tag_expected_quoted_string)
          << SourceRange(Equals.getLocation());
        Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
                                                       Ident.getHTMLIdent()));
        while (Tok.is(tok::html_equals) ||
               Tok.is(tok::html_quoted_string))
          consumeToken();
        continue;
      }
      Attrs.push_back(HTMLStartTagComment::Attribute(
                              Ident.getLocation(),
                              Ident.getHTMLIdent(),
                              Equals.getLocation(),
                              SourceRange(Tok.getLocation(),
                                          Tok.getEndLocation()),
                              Tok.getHTMLQuotedString()));
      consumeToken();
      continue;
    }

    case tok::html_greater:
      S.actOnHTMLStartTagFinish(HST, S.copyArray(llvm::ArrayRef(Attrs)),
                                Tok.getLocation(),
                                /* IsSelfClosing = */ false);
      consumeToken();
      return HST;

    case tok::html_slash_greater:
      S.actOnHTMLStartTagFinish(HST, S.copyArray(llvm::ArrayRef(Attrs)),
                                Tok.getLocation(),
                                /* IsSelfClosing = */ true);
      consumeToken();
      return HST;

    case tok::html_equals:
    case tok::html_quoted_string:
      Diag(Tok.getLocation(),
           diag::warn_doc_html_start_tag_expected_ident_or_greater);
      while (Tok.is(tok::html_equals) ||
             Tok.is(tok::html_quoted_string))
        consumeToken();
      if (Tok.is(tok::html_ident) ||
          Tok.is(tok::html_greater) ||
          Tok.is(tok::html_slash_greater))
        continue;

      S.actOnHTMLStartTagFinish(HST, S.copyArray(llvm::ArrayRef(Attrs)),
                                SourceLocation(),
                                /* IsSelfClosing = */ false);
      return HST;

    default:
      // Not a token from an HTML start tag.  Thus HTML tag prematurely ended.
      S.actOnHTMLStartTagFinish(HST, S.copyArray(llvm::ArrayRef(Attrs)),
                                SourceLocation(),
                                /* IsSelfClosing = */ false);
      bool StartLineInvalid;
      const unsigned StartLine = SourceMgr.getPresumedLineNumber(
                                                  HST->getLocation(),
                                                  &StartLineInvalid);
      bool EndLineInvalid;
      const unsigned EndLine = SourceMgr.getPresumedLineNumber(
                                                  Tok.getLocation(),
                                                  &EndLineInvalid);
      if (StartLineInvalid || EndLineInvalid || StartLine == EndLine)
        Diag(Tok.getLocation(),
             diag::warn_doc_html_start_tag_expected_ident_or_greater)
          << HST->getSourceRange();
      else {
        Diag(Tok.getLocation(),
             diag::warn_doc_html_start_tag_expected_ident_or_greater);
        Diag(HST->getLocation(), diag::note_doc_html_tag_started_here)
          << HST->getSourceRange();
      }
      return HST;
    }
  }
}

HTMLEndTagComment *Parser::parseHTMLEndTag() {
  assert(Tok.is(tok::html_end_tag));
  Token TokEndTag = Tok;
  consumeToken();
  SourceLocation Loc;
  if (Tok.is(tok::html_greater)) {
    Loc = Tok.getLocation();
    consumeToken();
  }

  return S.actOnHTMLEndTag(TokEndTag.getLocation(),
                           Loc,
                           TokEndTag.getHTMLTagEndName());
}

BlockContentComment *Parser::parseParagraphOrBlockCommand() {
  SmallVector<InlineContentComment *, 8> Content;

  while (true) {
    switch (Tok.getKind()) {
    case tok::verbatim_block_begin:
    case tok::verbatim_line_name:
    case tok::eof:
      break; // Block content or EOF ahead, finish this parapgaph.

    case tok::unknown_command:
      Content.push_back(S.actOnUnknownCommand(Tok.getLocation(),
                                              Tok.getEndLocation(),
                                              Tok.getUnknownCommandName()));
      consumeToken();
      continue;

    case tok::backslash_command:
    case tok::at_command: {
      const CommandInfo *Info = Traits.getCommandInfo(Tok.getCommandID());
      if (Info->IsBlockCommand) {
        if (Content.size() == 0)
          return parseBlockCommand();
        break; // Block command ahead, finish this parapgaph.
      }
      if (Info->IsVerbatimBlockEndCommand) {
        Diag(Tok.getLocation(),
             diag::warn_verbatim_block_end_without_start)
          << Tok.is(tok::at_command)
          << Info->Name
          << SourceRange(Tok.getLocation(), Tok.getEndLocation());
        consumeToken();
        continue;
      }
      if (Info->IsUnknownCommand) {
        Content.push_back(S.actOnUnknownCommand(Tok.getLocation(),
                                                Tok.getEndLocation(),
                                                Info->getID()));
        consumeToken();
        continue;
      }
      assert(Info->IsInlineCommand);
      Content.push_back(parseInlineCommand());
      continue;
    }

    case tok::newline: {
      consumeToken();
      if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
        consumeToken();
        break; // Two newlines -- end of paragraph.
      }
      // Also allow [tok::newline, tok::text, tok::newline] if the middle
      // tok::text is just whitespace.
      if (Tok.is(tok::text) && isWhitespace(Tok.getText())) {
        Token WhitespaceTok = Tok;
        consumeToken();
        if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
          consumeToken();
          break;
        }
        // We have [tok::newline, tok::text, non-newline].  Put back tok::text.
        putBack(WhitespaceTok);
      }
      if (Content.size() > 0)
        Content.back()->addTrailingNewline();
      continue;
    }

    // Don't deal with HTML tag soup now.
    case tok::html_start_tag:
      Content.push_back(parseHTMLStartTag());
      continue;

    case tok::html_end_tag:
      Content.push_back(parseHTMLEndTag());
      continue;

    case tok::text:
      Content.push_back(S.actOnText(Tok.getLocation(),
                                    Tok.getEndLocation(),
                                    Tok.getText()));
      consumeToken();
      continue;

    case tok::verbatim_block_line:
    case tok::verbatim_block_end:
    case tok::verbatim_line_text:
    case tok::html_ident:
    case tok::html_equals:
    case tok::html_quoted_string:
    case tok::html_greater:
    case tok::html_slash_greater:
      llvm_unreachable("should not see this token");
    }
    break;
  }

  return S.actOnParagraphComment(S.copyArray(llvm::ArrayRef(Content)));
}

VerbatimBlockComment *Parser::parseVerbatimBlock() {
  assert(Tok.is(tok::verbatim_block_begin));

  VerbatimBlockComment *VB =
      S.actOnVerbatimBlockStart(Tok.getLocation(),
                                Tok.getVerbatimBlockID());
  consumeToken();

  // Don't create an empty line if verbatim opening command is followed
  // by a newline.
  if (Tok.is(tok::newline))
    consumeToken();

  SmallVector<VerbatimBlockLineComment *, 8> Lines;
  while (Tok.is(tok::verbatim_block_line) ||
         Tok.is(tok::newline)) {
    VerbatimBlockLineComment *Line;
    if (Tok.is(tok::verbatim_block_line)) {
      Line = S.actOnVerbatimBlockLine(Tok.getLocation(),
                                      Tok.getVerbatimBlockText());
      consumeToken();
      if (Tok.is(tok::newline)) {
        consumeToken();
      }
    } else {
      // Empty line, just a tok::newline.
      Line = S.actOnVerbatimBlockLine(Tok.getLocation(), "");
      consumeToken();
    }
    Lines.push_back(Line);
  }

  if (Tok.is(tok::verbatim_block_end)) {
    const CommandInfo *Info = Traits.getCommandInfo(Tok.getVerbatimBlockID());
    S.actOnVerbatimBlockFinish(VB, Tok.getLocation(), Info->Name,
                               S.copyArray(llvm::ArrayRef(Lines)));
    consumeToken();
  } else {
    // Unterminated \\verbatim block
    S.actOnVerbatimBlockFinish(VB, SourceLocation(), "",
                               S.copyArray(llvm::ArrayRef(Lines)));
  }

  return VB;
}

VerbatimLineComment *Parser::parseVerbatimLine() {
  assert(Tok.is(tok::verbatim_line_name));

  Token NameTok = Tok;
  consumeToken();

  SourceLocation TextBegin;
  StringRef Text;
  // Next token might not be a tok::verbatim_line_text if verbatim line
  // starting command comes just before a newline or comment end.
  if (Tok.is(tok::verbatim_line_text)) {
    TextBegin = Tok.getLocation();
    Text = Tok.getVerbatimLineText();
  } else {
    TextBegin = NameTok.getEndLocation();
    Text = "";
  }

  VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(),
                                                NameTok.getVerbatimLineID(),
                                                TextBegin,
                                                Text);
  consumeToken();
  return VL;
}

BlockContentComment *Parser::parseBlockContent() {
  switch (Tok.getKind()) {
  case tok::text:
  case tok::unknown_command:
  case tok::backslash_command:
  case tok::at_command:
  case tok::html_start_tag:
  case tok::html_end_tag:
    return parseParagraphOrBlockCommand();

  case tok::verbatim_block_begin:
    return parseVerbatimBlock();

  case tok::verbatim_line_name:
    return parseVerbatimLine();

  case tok::eof:
  case tok::newline:
  case tok::verbatim_block_line:
  case tok::verbatim_block_end:
  case tok::verbatim_line_text:
  case tok::html_ident:
  case tok::html_equals:
  case tok::html_quoted_string:
  case tok::html_greater:
  case tok::html_slash_greater:
    llvm_unreachable("should not see this token");
  }
  llvm_unreachable("bogus token kind");
}

FullComment *Parser::parseFullComment() {
  // Skip newlines at the beginning of the comment.
  while (Tok.is(tok::newline))
    consumeToken();

  SmallVector<BlockContentComment *, 8> Blocks;
  while (Tok.isNot(tok::eof)) {
    Blocks.push_back(parseBlockContent());

    // Skip extra newlines after paragraph end.
    while (Tok.is(tok::newline))
      consumeToken();
  }
  return S.actOnFullComment(S.copyArray(llvm::ArrayRef(Blocks)));
}

} // end namespace comments
} // end namespace clang