1*0b57cec5SDimitry Andric //===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===// 2*0b57cec5SDimitry Andric // 3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*0b57cec5SDimitry Andric // 7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 8*0b57cec5SDimitry Andric /// 9*0b57cec5SDimitry Andric /// \file 10*0b57cec5SDimitry Andric /// This file contains the declaration of the UnwrappedLineParser, 11*0b57cec5SDimitry Andric /// which turns a stream of tokens into UnwrappedLines. 12*0b57cec5SDimitry Andric /// 13*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14*0b57cec5SDimitry Andric 15*0b57cec5SDimitry Andric #ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H 16*0b57cec5SDimitry Andric #define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H 17*0b57cec5SDimitry Andric 18*0b57cec5SDimitry Andric #include "FormatToken.h" 19*0b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h" 20*0b57cec5SDimitry Andric #include "clang/Format/Format.h" 21*0b57cec5SDimitry Andric #include "llvm/Support/Regex.h" 22*0b57cec5SDimitry Andric #include <list> 23*0b57cec5SDimitry Andric #include <stack> 24*0b57cec5SDimitry Andric 25*0b57cec5SDimitry Andric namespace clang { 26*0b57cec5SDimitry Andric namespace format { 27*0b57cec5SDimitry Andric 28*0b57cec5SDimitry Andric struct UnwrappedLineNode; 29*0b57cec5SDimitry Andric 30*0b57cec5SDimitry Andric /// An unwrapped line is a sequence of \c Token, that we would like to 31*0b57cec5SDimitry Andric /// put on a single line if there was no column limit. 32*0b57cec5SDimitry Andric /// 33*0b57cec5SDimitry Andric /// This is used as a main interface between the \c UnwrappedLineParser and the 34*0b57cec5SDimitry Andric /// \c UnwrappedLineFormatter. The key property is that changing the formatting 35*0b57cec5SDimitry Andric /// within an unwrapped line does not affect any other unwrapped lines. 36*0b57cec5SDimitry Andric struct UnwrappedLine { 37*0b57cec5SDimitry Andric UnwrappedLine(); 38*0b57cec5SDimitry Andric 39*0b57cec5SDimitry Andric // FIXME: Don't use std::list here. 40*0b57cec5SDimitry Andric /// The \c Tokens comprising this \c UnwrappedLine. 41*0b57cec5SDimitry Andric std::list<UnwrappedLineNode> Tokens; 42*0b57cec5SDimitry Andric 43*0b57cec5SDimitry Andric /// The indent level of the \c UnwrappedLine. 44*0b57cec5SDimitry Andric unsigned Level; 45*0b57cec5SDimitry Andric 46*0b57cec5SDimitry Andric /// Whether this \c UnwrappedLine is part of a preprocessor directive. 47*0b57cec5SDimitry Andric bool InPPDirective; 48*0b57cec5SDimitry Andric 49*0b57cec5SDimitry Andric bool MustBeDeclaration; 50*0b57cec5SDimitry Andric 51*0b57cec5SDimitry Andric /// If this \c UnwrappedLine closes a block in a sequence of lines, 52*0b57cec5SDimitry Andric /// \c MatchingOpeningBlockLineIndex stores the index of the corresponding 53*0b57cec5SDimitry Andric /// opening line. Otherwise, \c MatchingOpeningBlockLineIndex must be 54*0b57cec5SDimitry Andric /// \c kInvalidIndex. 55*0b57cec5SDimitry Andric size_t MatchingOpeningBlockLineIndex = kInvalidIndex; 56*0b57cec5SDimitry Andric 57*0b57cec5SDimitry Andric /// If this \c UnwrappedLine opens a block, stores the index of the 58*0b57cec5SDimitry Andric /// line with the corresponding closing brace. 59*0b57cec5SDimitry Andric size_t MatchingClosingBlockLineIndex = kInvalidIndex; 60*0b57cec5SDimitry Andric 61*0b57cec5SDimitry Andric static const size_t kInvalidIndex = -1; 62*0b57cec5SDimitry Andric 63*0b57cec5SDimitry Andric unsigned FirstStartColumn = 0; 64*0b57cec5SDimitry Andric }; 65*0b57cec5SDimitry Andric 66*0b57cec5SDimitry Andric class UnwrappedLineConsumer { 67*0b57cec5SDimitry Andric public: 68*0b57cec5SDimitry Andric virtual ~UnwrappedLineConsumer() {} 69*0b57cec5SDimitry Andric virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0; 70*0b57cec5SDimitry Andric virtual void finishRun() = 0; 71*0b57cec5SDimitry Andric }; 72*0b57cec5SDimitry Andric 73*0b57cec5SDimitry Andric class FormatTokenSource; 74*0b57cec5SDimitry Andric 75*0b57cec5SDimitry Andric class UnwrappedLineParser { 76*0b57cec5SDimitry Andric public: 77*0b57cec5SDimitry Andric UnwrappedLineParser(const FormatStyle &Style, 78*0b57cec5SDimitry Andric const AdditionalKeywords &Keywords, 79*0b57cec5SDimitry Andric unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens, 80*0b57cec5SDimitry Andric UnwrappedLineConsumer &Callback); 81*0b57cec5SDimitry Andric 82*0b57cec5SDimitry Andric void parse(); 83*0b57cec5SDimitry Andric 84*0b57cec5SDimitry Andric private: 85*0b57cec5SDimitry Andric void reset(); 86*0b57cec5SDimitry Andric void parseFile(); 87*0b57cec5SDimitry Andric void parseLevel(bool HasOpeningBrace); 88*0b57cec5SDimitry Andric void parseBlock(bool MustBeDeclaration, bool AddLevel = true, 89*0b57cec5SDimitry Andric bool MunchSemi = true); 90*0b57cec5SDimitry Andric void parseChildBlock(); 91*0b57cec5SDimitry Andric void parsePPDirective(); 92*0b57cec5SDimitry Andric void parsePPDefine(); 93*0b57cec5SDimitry Andric void parsePPIf(bool IfDef); 94*0b57cec5SDimitry Andric void parsePPElIf(); 95*0b57cec5SDimitry Andric void parsePPElse(); 96*0b57cec5SDimitry Andric void parsePPEndIf(); 97*0b57cec5SDimitry Andric void parsePPUnknown(); 98*0b57cec5SDimitry Andric void readTokenWithJavaScriptASI(); 99*0b57cec5SDimitry Andric void parseStructuralElement(); 100*0b57cec5SDimitry Andric bool tryToParseBracedList(); 101*0b57cec5SDimitry Andric bool parseBracedList(bool ContinueOnSemicolons = false, 102*0b57cec5SDimitry Andric tok::TokenKind ClosingBraceKind = tok::r_brace); 103*0b57cec5SDimitry Andric void parseParens(); 104*0b57cec5SDimitry Andric void parseSquare(bool LambdaIntroducer = false); 105*0b57cec5SDimitry Andric void parseIfThenElse(); 106*0b57cec5SDimitry Andric void parseTryCatch(); 107*0b57cec5SDimitry Andric void parseForOrWhileLoop(); 108*0b57cec5SDimitry Andric void parseDoWhile(); 109*0b57cec5SDimitry Andric void parseLabel(); 110*0b57cec5SDimitry Andric void parseCaseLabel(); 111*0b57cec5SDimitry Andric void parseSwitch(); 112*0b57cec5SDimitry Andric void parseNamespace(); 113*0b57cec5SDimitry Andric void parseNew(); 114*0b57cec5SDimitry Andric void parseAccessSpecifier(); 115*0b57cec5SDimitry Andric bool parseEnum(); 116*0b57cec5SDimitry Andric void parseJavaEnumBody(); 117*0b57cec5SDimitry Andric // Parses a record (aka class) as a top level element. If ParseAsExpr is true, 118*0b57cec5SDimitry Andric // parses the record as a child block, i.e. if the class declaration is an 119*0b57cec5SDimitry Andric // expression. 120*0b57cec5SDimitry Andric void parseRecord(bool ParseAsExpr = false); 121*0b57cec5SDimitry Andric void parseObjCMethod(); 122*0b57cec5SDimitry Andric void parseObjCProtocolList(); 123*0b57cec5SDimitry Andric void parseObjCUntilAtEnd(); 124*0b57cec5SDimitry Andric void parseObjCInterfaceOrImplementation(); 125*0b57cec5SDimitry Andric bool parseObjCProtocol(); 126*0b57cec5SDimitry Andric void parseJavaScriptEs6ImportExport(); 127*0b57cec5SDimitry Andric void parseStatementMacro(); 128*0b57cec5SDimitry Andric bool tryToParseLambda(); 129*0b57cec5SDimitry Andric bool tryToParseLambdaIntroducer(); 130*0b57cec5SDimitry Andric void tryToParseJSFunction(); 131*0b57cec5SDimitry Andric void addUnwrappedLine(); 132*0b57cec5SDimitry Andric bool eof() const; 133*0b57cec5SDimitry Andric // LevelDifference is the difference of levels after and before the current 134*0b57cec5SDimitry Andric // token. For example: 135*0b57cec5SDimitry Andric // - if the token is '{' and opens a block, LevelDifference is 1. 136*0b57cec5SDimitry Andric // - if the token is '}' and closes a block, LevelDifference is -1. 137*0b57cec5SDimitry Andric void nextToken(int LevelDifference = 0); 138*0b57cec5SDimitry Andric void readToken(int LevelDifference = 0); 139*0b57cec5SDimitry Andric 140*0b57cec5SDimitry Andric // Decides which comment tokens should be added to the current line and which 141*0b57cec5SDimitry Andric // should be added as comments before the next token. 142*0b57cec5SDimitry Andric // 143*0b57cec5SDimitry Andric // Comments specifies the sequence of comment tokens to analyze. They get 144*0b57cec5SDimitry Andric // either pushed to the current line or added to the comments before the next 145*0b57cec5SDimitry Andric // token. 146*0b57cec5SDimitry Andric // 147*0b57cec5SDimitry Andric // NextTok specifies the next token. A null pointer NextTok is supported, and 148*0b57cec5SDimitry Andric // signifies either the absence of a next token, or that the next token 149*0b57cec5SDimitry Andric // shouldn't be taken into accunt for the analysis. 150*0b57cec5SDimitry Andric void distributeComments(const SmallVectorImpl<FormatToken *> &Comments, 151*0b57cec5SDimitry Andric const FormatToken *NextTok); 152*0b57cec5SDimitry Andric 153*0b57cec5SDimitry Andric // Adds the comment preceding the next token to unwrapped lines. 154*0b57cec5SDimitry Andric void flushComments(bool NewlineBeforeNext); 155*0b57cec5SDimitry Andric void pushToken(FormatToken *Tok); 156*0b57cec5SDimitry Andric void calculateBraceTypes(bool ExpectClassBody = false); 157*0b57cec5SDimitry Andric 158*0b57cec5SDimitry Andric // Marks a conditional compilation edge (for example, an '#if', '#ifdef', 159*0b57cec5SDimitry Andric // '#else' or merge conflict marker). If 'Unreachable' is true, assumes 160*0b57cec5SDimitry Andric // this branch either cannot be taken (for example '#if false'), or should 161*0b57cec5SDimitry Andric // not be taken in this round. 162*0b57cec5SDimitry Andric void conditionalCompilationCondition(bool Unreachable); 163*0b57cec5SDimitry Andric void conditionalCompilationStart(bool Unreachable); 164*0b57cec5SDimitry Andric void conditionalCompilationAlternative(); 165*0b57cec5SDimitry Andric void conditionalCompilationEnd(); 166*0b57cec5SDimitry Andric 167*0b57cec5SDimitry Andric bool isOnNewLine(const FormatToken &FormatTok); 168*0b57cec5SDimitry Andric 169*0b57cec5SDimitry Andric // Compute hash of the current preprocessor branch. 170*0b57cec5SDimitry Andric // This is used to identify the different branches, and thus track if block 171*0b57cec5SDimitry Andric // open and close in the same branch. 172*0b57cec5SDimitry Andric size_t computePPHash() const; 173*0b57cec5SDimitry Andric 174*0b57cec5SDimitry Andric // FIXME: We are constantly running into bugs where Line.Level is incorrectly 175*0b57cec5SDimitry Andric // subtracted from beyond 0. Introduce a method to subtract from Line.Level 176*0b57cec5SDimitry Andric // and use that everywhere in the Parser. 177*0b57cec5SDimitry Andric std::unique_ptr<UnwrappedLine> Line; 178*0b57cec5SDimitry Andric 179*0b57cec5SDimitry Andric // Comments are sorted into unwrapped lines by whether they are in the same 180*0b57cec5SDimitry Andric // line as the previous token, or not. If not, they belong to the next token. 181*0b57cec5SDimitry Andric // Since the next token might already be in a new unwrapped line, we need to 182*0b57cec5SDimitry Andric // store the comments belonging to that token. 183*0b57cec5SDimitry Andric SmallVector<FormatToken *, 1> CommentsBeforeNextToken; 184*0b57cec5SDimitry Andric FormatToken *FormatTok; 185*0b57cec5SDimitry Andric bool MustBreakBeforeNextToken; 186*0b57cec5SDimitry Andric 187*0b57cec5SDimitry Andric // The parsed lines. Only added to through \c CurrentLines. 188*0b57cec5SDimitry Andric SmallVector<UnwrappedLine, 8> Lines; 189*0b57cec5SDimitry Andric 190*0b57cec5SDimitry Andric // Preprocessor directives are parsed out-of-order from other unwrapped lines. 191*0b57cec5SDimitry Andric // Thus, we need to keep a list of preprocessor directives to be reported 192*0b57cec5SDimitry Andric // after an unwrapped line that has been started was finished. 193*0b57cec5SDimitry Andric SmallVector<UnwrappedLine, 4> PreprocessorDirectives; 194*0b57cec5SDimitry Andric 195*0b57cec5SDimitry Andric // New unwrapped lines are added via CurrentLines. 196*0b57cec5SDimitry Andric // Usually points to \c &Lines. While parsing a preprocessor directive when 197*0b57cec5SDimitry Andric // there is an unfinished previous unwrapped line, will point to 198*0b57cec5SDimitry Andric // \c &PreprocessorDirectives. 199*0b57cec5SDimitry Andric SmallVectorImpl<UnwrappedLine> *CurrentLines; 200*0b57cec5SDimitry Andric 201*0b57cec5SDimitry Andric // We store for each line whether it must be a declaration depending on 202*0b57cec5SDimitry Andric // whether we are in a compound statement or not. 203*0b57cec5SDimitry Andric std::vector<bool> DeclarationScopeStack; 204*0b57cec5SDimitry Andric 205*0b57cec5SDimitry Andric const FormatStyle &Style; 206*0b57cec5SDimitry Andric const AdditionalKeywords &Keywords; 207*0b57cec5SDimitry Andric 208*0b57cec5SDimitry Andric llvm::Regex CommentPragmasRegex; 209*0b57cec5SDimitry Andric 210*0b57cec5SDimitry Andric FormatTokenSource *Tokens; 211*0b57cec5SDimitry Andric UnwrappedLineConsumer &Callback; 212*0b57cec5SDimitry Andric 213*0b57cec5SDimitry Andric // FIXME: This is a temporary measure until we have reworked the ownership 214*0b57cec5SDimitry Andric // of the format tokens. The goal is to have the actual tokens created and 215*0b57cec5SDimitry Andric // owned outside of and handed into the UnwrappedLineParser. 216*0b57cec5SDimitry Andric ArrayRef<FormatToken *> AllTokens; 217*0b57cec5SDimitry Andric 218*0b57cec5SDimitry Andric // Represents preprocessor branch type, so we can find matching 219*0b57cec5SDimitry Andric // #if/#else/#endif directives. 220*0b57cec5SDimitry Andric enum PPBranchKind { 221*0b57cec5SDimitry Andric PP_Conditional, // Any #if, #ifdef, #ifndef, #elif, block outside #if 0 222*0b57cec5SDimitry Andric PP_Unreachable // #if 0 or a conditional preprocessor block inside #if 0 223*0b57cec5SDimitry Andric }; 224*0b57cec5SDimitry Andric 225*0b57cec5SDimitry Andric struct PPBranch { 226*0b57cec5SDimitry Andric PPBranch(PPBranchKind Kind, size_t Line) : Kind(Kind), Line(Line) {} 227*0b57cec5SDimitry Andric PPBranchKind Kind; 228*0b57cec5SDimitry Andric size_t Line; 229*0b57cec5SDimitry Andric }; 230*0b57cec5SDimitry Andric 231*0b57cec5SDimitry Andric // Keeps a stack of currently active preprocessor branching directives. 232*0b57cec5SDimitry Andric SmallVector<PPBranch, 16> PPStack; 233*0b57cec5SDimitry Andric 234*0b57cec5SDimitry Andric // The \c UnwrappedLineParser re-parses the code for each combination 235*0b57cec5SDimitry Andric // of preprocessor branches that can be taken. 236*0b57cec5SDimitry Andric // To that end, we take the same branch (#if, #else, or one of the #elif 237*0b57cec5SDimitry Andric // branches) for each nesting level of preprocessor branches. 238*0b57cec5SDimitry Andric // \c PPBranchLevel stores the current nesting level of preprocessor 239*0b57cec5SDimitry Andric // branches during one pass over the code. 240*0b57cec5SDimitry Andric int PPBranchLevel; 241*0b57cec5SDimitry Andric 242*0b57cec5SDimitry Andric // Contains the current branch (#if, #else or one of the #elif branches) 243*0b57cec5SDimitry Andric // for each nesting level. 244*0b57cec5SDimitry Andric SmallVector<int, 8> PPLevelBranchIndex; 245*0b57cec5SDimitry Andric 246*0b57cec5SDimitry Andric // Contains the maximum number of branches at each nesting level. 247*0b57cec5SDimitry Andric SmallVector<int, 8> PPLevelBranchCount; 248*0b57cec5SDimitry Andric 249*0b57cec5SDimitry Andric // Contains the number of branches per nesting level we are currently 250*0b57cec5SDimitry Andric // in while parsing a preprocessor branch sequence. 251*0b57cec5SDimitry Andric // This is used to update PPLevelBranchCount at the end of a branch 252*0b57cec5SDimitry Andric // sequence. 253*0b57cec5SDimitry Andric std::stack<int> PPChainBranchIndex; 254*0b57cec5SDimitry Andric 255*0b57cec5SDimitry Andric // Include guard search state. Used to fixup preprocessor indent levels 256*0b57cec5SDimitry Andric // so that include guards do not participate in indentation. 257*0b57cec5SDimitry Andric enum IncludeGuardState { 258*0b57cec5SDimitry Andric IG_Inited, // Search started, looking for #ifndef. 259*0b57cec5SDimitry Andric IG_IfNdefed, // #ifndef found, IncludeGuardToken points to condition. 260*0b57cec5SDimitry Andric IG_Defined, // Matching #define found, checking other requirements. 261*0b57cec5SDimitry Andric IG_Found, // All requirements met, need to fix indents. 262*0b57cec5SDimitry Andric IG_Rejected, // Search failed or never started. 263*0b57cec5SDimitry Andric }; 264*0b57cec5SDimitry Andric 265*0b57cec5SDimitry Andric // Current state of include guard search. 266*0b57cec5SDimitry Andric IncludeGuardState IncludeGuard; 267*0b57cec5SDimitry Andric 268*0b57cec5SDimitry Andric // Points to the #ifndef condition for a potential include guard. Null unless 269*0b57cec5SDimitry Andric // IncludeGuardState == IG_IfNdefed. 270*0b57cec5SDimitry Andric FormatToken *IncludeGuardToken; 271*0b57cec5SDimitry Andric 272*0b57cec5SDimitry Andric // Contains the first start column where the source begins. This is zero for 273*0b57cec5SDimitry Andric // normal source code and may be nonzero when formatting a code fragment that 274*0b57cec5SDimitry Andric // does not start at the beginning of the file. 275*0b57cec5SDimitry Andric unsigned FirstStartColumn; 276*0b57cec5SDimitry Andric 277*0b57cec5SDimitry Andric friend class ScopedLineState; 278*0b57cec5SDimitry Andric friend class CompoundStatementIndenter; 279*0b57cec5SDimitry Andric }; 280*0b57cec5SDimitry Andric 281*0b57cec5SDimitry Andric struct UnwrappedLineNode { 282*0b57cec5SDimitry Andric UnwrappedLineNode() : Tok(nullptr) {} 283*0b57cec5SDimitry Andric UnwrappedLineNode(FormatToken *Tok) : Tok(Tok) {} 284*0b57cec5SDimitry Andric 285*0b57cec5SDimitry Andric FormatToken *Tok; 286*0b57cec5SDimitry Andric SmallVector<UnwrappedLine, 0> Children; 287*0b57cec5SDimitry Andric }; 288*0b57cec5SDimitry Andric 289*0b57cec5SDimitry Andric inline UnwrappedLine::UnwrappedLine() 290*0b57cec5SDimitry Andric : Level(0), InPPDirective(false), MustBeDeclaration(false), 291*0b57cec5SDimitry Andric MatchingOpeningBlockLineIndex(kInvalidIndex) {} 292*0b57cec5SDimitry Andric 293*0b57cec5SDimitry Andric } // end namespace format 294*0b57cec5SDimitry Andric } // end namespace clang 295*0b57cec5SDimitry Andric 296*0b57cec5SDimitry Andric #endif 297