//===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// This file contains the main building blocks of macro support in /// clang-format. /// /// In order to not violate the requirement that clang-format can format files /// in isolation, clang-format's macro support uses expansions users provide /// as part of clang-format's style configuration. /// /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support /// one level of expansion (\see MacroExpander for a full description of what /// is supported). /// /// As part of parsing, clang-format uses the MacroExpander to expand the /// spelled token streams into expanded token streams when it encounters a /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines /// from the expanded token stream. /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor /// matches the spelled token stream into unwrapped lines that best resemble the /// structure of the expanded unwrapped lines. These reconstructed unwrapped /// lines are aliasing the tokens in the expanded token stream, so that token /// annotations will be reused when formatting the spelled macro calls. /// /// When formatting, clang-format annotates and formats the expanded unwrapped /// lines first, determining the token types. Next, it formats the spelled /// unwrapped lines, keeping the token types fixed, while allowing other /// formatting decisions to change. /// //===----------------------------------------------------------------------===// #ifndef CLANG_LIB_FORMAT_MACROS_H #define CLANG_LIB_FORMAT_MACROS_H #include #include #include #include #include "FormatToken.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" namespace clang { namespace format { struct UnwrappedLine; struct UnwrappedLineNode; /// Takes a set of macro definitions as strings and allows expanding calls to /// those macros. /// /// For example: /// Definition: A(x, y)=x + y /// Call : A(int a = 1, 2) /// Expansion : int a = 1 + 2 /// /// Expansion does not check arity of the definition. /// If fewer arguments than expected are provided, the remaining parameters /// are considered empty: /// Call : A(a) /// Expansion: a + /// If more arguments than expected are provided, they will be discarded. /// /// The expander does not support: /// - recursive expansion /// - stringification /// - concatenation /// - variadic macros /// /// Furthermore, only a single expansion of each macro argument is supported, /// so that we cannot get conflicting formatting decisions from different /// expansions. /// Definition: A(x)=x+x /// Call : A(id) /// Expansion : id+x /// class MacroExpander { public: using ArgsList = llvm::ArrayRef>; /// Construct a macro expander from a set of macro definitions. /// Macro definitions must be encoded as UTF-8. /// /// Each entry in \p Macros must conform to the following simple /// macro-definition language: /// ::= | "(" ")" /// ::= | "" /// ::= | "," /// ::= "=" | /// ::= | /// /// Macros that cannot be parsed will be silently discarded. /// MacroExpander(const std::vector &Macros, clang::SourceManager &SourceMgr, const FormatStyle &Style, llvm::SpecificBumpPtrAllocator &Allocator, IdentifierTable &IdentTable); ~MacroExpander(); /// Returns whether any macro \p Name is defined, regardless of overloads. bool defined(llvm::StringRef Name) const; /// Returns whetherh there is an object-like overload, i.e. where the macro /// has no arguments and should not consume subsequent parentheses. bool objectLike(llvm::StringRef Name) const; /// Returns whether macro \p Name provides an overload with the given arity. bool hasArity(llvm::StringRef Name, unsigned Arity) const; /// Returns the expanded stream of format tokens for \p ID, where /// each element in \p Args is a positional argument to the macro call. /// If \p Args is not set, the object-like overload is used. /// If \p Args is set, the overload with the arity equal to \c Args.size() is /// used. llvm::SmallVector expand(FormatToken *ID, std::optional OptionalArgs) const; private: struct Definition; class DefinitionParser; void parseDefinition(const std::string &Macro); clang::SourceManager &SourceMgr; const FormatStyle &Style; llvm::SpecificBumpPtrAllocator &Allocator; IdentifierTable &IdentTable; SmallVector> Buffers; llvm::StringMap> FunctionLike; llvm::StringMap ObjectLike; }; /// Converts a sequence of UnwrappedLines containing expanded macros into a /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be /// broken into child lines, in a way that best conveys the structure of the /// expanded code. /// /// In the simplest case, a spelled UnwrappedLine contains one macro, and after /// expanding it we have one expanded UnwrappedLine. In general, macro /// expansions can span UnwrappedLines, and multiple macros can contribute /// tokens to the same line. We keep consuming expanded lines until: /// * all expansions that started have finished (we're not chopping any macros /// in half) /// * *and* we've reached the end of a *spelled* unwrapped line. /// /// A single UnwrappedLine represents this chunk of code. /// /// After this point, the state of the spelled/expanded stream is "in sync" /// (both at the start of an UnwrappedLine, with no macros open), so the /// Reconstructor can be thrown away and parsing can continue. /// /// Given a mapping from the macro name identifier token in the macro call /// to the tokens of the macro call, for example: /// CLASSA -> CLASSA({public: void x();}) /// /// When getting the formatted lines of the expansion via the \c addLine method /// (each '->' specifies a call to \c addLine ): /// -> class A { /// -> public: /// -> void x(); /// -> }; /// /// Creates the tree of unwrapped lines containing the macro call tokens so that /// the macro call tokens fit the semantic structure of the expanded formatted /// lines: /// -> CLASSA({ /// -> public: /// -> void x(); /// -> }) class MacroCallReconstructor { public: /// Create an Reconstructor whose resulting \p UnwrappedLine will start at /// \p Level, using the map from name identifier token to the corresponding /// tokens of the spelled macro call. MacroCallReconstructor( unsigned Level, const llvm::DenseMap> &ActiveExpansions); /// For the given \p Line, match all occurences of tokens expanded from a /// macro to unwrapped lines in the spelled macro call so that the resulting /// tree of unwrapped lines best resembles the structure of unwrapped lines /// passed in via \c addLine. void addLine(const UnwrappedLine &Line); /// Check whether at the current state there is no open macro expansion /// that needs to be processed to finish an macro call. /// Only when \c finished() is true, \c takeResult() can be called to retrieve /// the resulting \c UnwrappedLine. /// If there are multiple subsequent macro calls within an unwrapped line in /// the spelled token stream, the calling code may also continue to call /// \c addLine() when \c finished() is true. bool finished() const { return ActiveExpansions.empty(); } /// Retrieve the formatted \c UnwrappedLine containing the orginal /// macro calls, formatted according to the expanded token stream received /// via \c addLine(). /// Generally, this line tries to have the same structure as the expanded, /// formatted unwrapped lines handed in via \c addLine(), with the exception /// that for multiple top-level lines, each subsequent line will be the /// child of the last token in its predecessor. This representation is chosen /// because it is a precondition to the formatter that we get what looks like /// a single statement in a single \c UnwrappedLine (i.e. matching parens). /// /// If a token in a macro argument is a child of a token in the expansion, /// the parent will be the corresponding token in the macro call. /// For example: /// #define C(a, b) class C { a b /// C(int x;, int y;) /// would expand to /// class C { int x; int y; /// where in a formatted line "int x;" and "int y;" would both be new separate /// lines. /// /// In the result, "int x;" will be a child of the opening parenthesis in "C(" /// and "int y;" will be a child of the "," token: /// C ( /// \- int x; /// , /// \- int y; /// ) UnwrappedLine takeResult() &&; private: void add(FormatToken *Token, FormatToken *ExpandedParent, bool First); void prepareParent(FormatToken *ExpandedParent, bool First); FormatToken *getParentInResult(FormatToken *Parent); void reconstruct(FormatToken *Token); void startReconstruction(FormatToken *Token); bool reconstructActiveCallUntil(FormatToken *Token); void endReconstruction(FormatToken *Token); bool processNextReconstructed(); void finalize(); struct ReconstructedLine; void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr); UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level); void debug(const ReconstructedLine &Line, int Level); ReconstructedLine &parentLine(); ReconstructedLine *currentLine(); void debugParentMap() const; #ifndef NDEBUG enum ReconstructorState { Start, // No macro expansion was found in the input yet. InProgress, // During a macro reconstruction. Finalized, // Past macro reconstruction, the result is finalized. }; ReconstructorState State = Start; #endif // Node in which we build up the resulting unwrapped line; this type is // analogous to UnwrappedLineNode. struct LineNode { LineNode() = default; LineNode(FormatToken *Tok) : Tok(Tok) {} FormatToken *Tok = nullptr; llvm::SmallVector> Children; }; // Line in which we build up the resulting unwrapped line. // FIXME: Investigate changing UnwrappedLine to a pointer type and using it // instead of rolling our own type. struct ReconstructedLine { llvm::SmallVector> Tokens; }; // The line in which we collect the resulting reconstructed output. // To reduce special cases in the algorithm, the first level of the line // contains a single null token that has the reconstructed incoming // lines as children. // In the end, we stich the lines together so that each subsequent line // is a child of the last token of the previous line. This is necessary // in order to format the overall expression as a single logical line - // if we created separate lines, we'd format them with their own top-level // indent depending on the semantic structure, which is not desired. ReconstructedLine Result; // Stack of currently "open" lines, where each line's predecessor's last // token is the parent token for that line. llvm::SmallVector ActiveReconstructedLines; // Maps from the expanded token to the token that takes its place in the // reconstructed token stream in terms of parent-child relationships. // Note that it might take multiple steps to arrive at the correct // parent in the output. // Given: #define C(a, b) []() { a; b; } // And a call: C(f(), g()) // The structure in the incoming formatted unwrapped line will be: // []() { // |- f(); // \- g(); // } // with f and g being children of the opening brace. // In the reconstructed call: // C(f(), g()) // \- f() // \- g() // We want f to be a child of the opening parenthesis and g to be a child // of the comma token in the macro call. // Thus, we map // { -> ( // and add // ( -> , // once we're past the comma in the reconstruction. llvm::DenseMap SpelledParentToReconstructedParent; // Keeps track of a single expansion while we're reconstructing tokens it // generated. struct Expansion { // The identifier token of the macro call. FormatToken *ID; // Our current position in the reconstruction. std::list::iterator SpelledI; // The end of the reconstructed token sequence. std::list::iterator SpelledE; }; // Stack of macro calls for which we're in the middle of an expansion. llvm::SmallVector ActiveExpansions; struct MacroCallState { MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken, FormatToken *MacroCallLParen); ReconstructedLine *Line; // The last token in the parent line or expansion, or nullptr if the macro // expansion is on a top-level line. // // For example, in the macro call: // auto f = []() { ID(1); }; // The MacroCallState for ID will have '{' as ParentLastToken. // // In the macro call: // ID(ID(void f())); // The MacroCallState of the outer ID will have nullptr as ParentLastToken, // while the MacroCallState for the inner ID will have the '(' of the outer // ID as ParentLastToken. // // In the macro call: // ID2(a, ID(b)); // The MacroCallState of ID will have ',' as ParentLastToken. FormatToken *ParentLastToken; // The l_paren of this MacroCallState's macro call. FormatToken *MacroCallLParen; }; // Keeps track of the lines into which the opening brace/parenthesis & // argument separating commas for each level in the macro call go in order to // put the corresponding closing brace/parenthesis into the same line in the // output and keep track of which parents in the expanded token stream map to // which tokens in the reconstructed stream. // When an opening brace/parenthesis has children, we want the structure of // the output line to be: // |- MACRO // |- ( // | \- // |- , // | \- // \- ) llvm::SmallVector MacroCallStructure; // Level the generated UnwrappedLine will be at. const unsigned Level; // Maps from identifier of the macro call to an unwrapped line containing // all tokens of the macro call. const llvm::DenseMap> &IdToReconstructed; }; } // namespace format } // namespace clang #endif