1 //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains the main building blocks of macro support in 11 /// clang-format. 12 /// 13 /// In order to not violate the requirement that clang-format can format files 14 /// in isolation, clang-format's macro support uses expansions users provide 15 /// as part of clang-format's style configuration. 16 /// 17 /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support 18 /// one level of expansion (\see MacroExpander for a full description of what 19 /// is supported). 20 /// 21 /// As part of parsing, clang-format uses the MacroExpander to expand the 22 /// spelled token streams into expanded token streams when it encounters a 23 /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines 24 /// from the expanded token stream. 25 /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor 26 /// matches the spelled token stream into unwrapped lines that best resemble the 27 /// structure of the expanded unwrapped lines. These reconstructed unwrapped 28 /// lines are aliasing the tokens in the expanded token stream, so that token 29 /// annotations will be reused when formatting the spelled macro calls. 30 /// 31 /// When formatting, clang-format annotates and formats the expanded unwrapped 32 /// lines first, determining the token types. Next, it formats the spelled 33 /// unwrapped lines, keeping the token types fixed, while allowing other 34 /// formatting decisions to change. 35 /// 36 //===----------------------------------------------------------------------===// 37 38 #ifndef CLANG_LIB_FORMAT_MACROS_H 39 #define CLANG_LIB_FORMAT_MACROS_H 40 41 #include <list> 42 #include <map> 43 #include <string> 44 #include <vector> 45 46 #include "FormatToken.h" 47 #include "llvm/ADT/ArrayRef.h" 48 #include "llvm/ADT/DenseMap.h" 49 #include "llvm/ADT/SmallVector.h" 50 #include "llvm/ADT/StringRef.h" 51 52 namespace clang { 53 namespace format { 54 55 struct UnwrappedLine; 56 struct UnwrappedLineNode; 57 58 /// Takes a set of macro definitions as strings and allows expanding calls to 59 /// those macros. 60 /// 61 /// For example: 62 /// Definition: A(x, y)=x + y 63 /// Call : A(int a = 1, 2) 64 /// Expansion : int a = 1 + 2 65 /// 66 /// Expansion does not check arity of the definition. 67 /// If fewer arguments than expected are provided, the remaining parameters 68 /// are considered empty: 69 /// Call : A(a) 70 /// Expansion: a + 71 /// If more arguments than expected are provided, they will be discarded. 72 /// 73 /// The expander does not support: 74 /// - recursive expansion 75 /// - stringification 76 /// - concatenation 77 /// - variadic macros 78 /// 79 /// Furthermore, only a single expansion of each macro argument is supported, 80 /// so that we cannot get conflicting formatting decisions from different 81 /// expansions. 82 /// Definition: A(x)=x+x 83 /// Call : A(id) 84 /// Expansion : id+x 85 /// 86 class MacroExpander { 87 public: 88 using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>; 89 90 /// Construct a macro expander from a set of macro definitions. 91 /// Macro definitions must be encoded as UTF-8. 92 /// 93 /// Each entry in \p Macros must conform to the following simple 94 /// macro-definition language: 95 /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion> 96 /// <params> ::= <id-list> | "" 97 /// <id-list> ::= <id> | <id> "," <params> 98 /// <expansion> ::= "=" <tail> | <eof> 99 /// <tail> ::= <tok> <tail> | <eof> 100 /// 101 /// Macros that cannot be parsed will be silently discarded. 102 /// 103 MacroExpander(const std::vector<std::string> &Macros, 104 clang::SourceManager &SourceMgr, const FormatStyle &Style, 105 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, 106 IdentifierTable &IdentTable); 107 ~MacroExpander(); 108 109 /// Returns whether any macro \p Name is defined, regardless of overloads. 110 bool defined(llvm::StringRef Name) const; 111 112 /// Returns whetherh there is an object-like overload, i.e. where the macro 113 /// has no arguments and should not consume subsequent parentheses. 114 bool objectLike(llvm::StringRef Name) const; 115 116 /// Returns whether macro \p Name provides an overload with the given arity. 117 bool hasArity(llvm::StringRef Name, unsigned Arity) const; 118 119 /// Returns the expanded stream of format tokens for \p ID, where 120 /// each element in \p Args is a positional argument to the macro call. 121 /// If \p Args is not set, the object-like overload is used. 122 /// If \p Args is set, the overload with the arity equal to \c Args.size() is 123 /// used. 124 llvm::SmallVector<FormatToken *, 8> 125 expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const; 126 127 private: 128 struct Definition; 129 class DefinitionParser; 130 131 void parseDefinition(const std::string &Macro); 132 133 clang::SourceManager &SourceMgr; 134 const FormatStyle &Style; 135 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator; 136 IdentifierTable &IdentTable; 137 SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers; 138 llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike; 139 llvm::StringMap<Definition> ObjectLike; 140 }; 141 142 /// Converts a sequence of UnwrappedLines containing expanded macros into a 143 /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be 144 /// broken into child lines, in a way that best conveys the structure of the 145 /// expanded code. 146 /// 147 /// In the simplest case, a spelled UnwrappedLine contains one macro, and after 148 /// expanding it we have one expanded UnwrappedLine. In general, macro 149 /// expansions can span UnwrappedLines, and multiple macros can contribute 150 /// tokens to the same line. We keep consuming expanded lines until: 151 /// * all expansions that started have finished (we're not chopping any macros 152 /// in half) 153 /// * *and* we've reached the end of a *spelled* unwrapped line. 154 /// 155 /// A single UnwrappedLine represents this chunk of code. 156 /// 157 /// After this point, the state of the spelled/expanded stream is "in sync" 158 /// (both at the start of an UnwrappedLine, with no macros open), so the 159 /// Reconstructor can be thrown away and parsing can continue. 160 /// 161 /// Given a mapping from the macro name identifier token in the macro call 162 /// to the tokens of the macro call, for example: 163 /// CLASSA -> CLASSA({public: void x();}) 164 /// 165 /// When getting the formatted lines of the expansion via the \c addLine method 166 /// (each '->' specifies a call to \c addLine ): 167 /// -> class A { 168 /// -> public: 169 /// -> void x(); 170 /// -> }; 171 /// 172 /// Creates the tree of unwrapped lines containing the macro call tokens so that 173 /// the macro call tokens fit the semantic structure of the expanded formatted 174 /// lines: 175 /// -> CLASSA({ 176 /// -> public: 177 /// -> void x(); 178 /// -> }) 179 class MacroCallReconstructor { 180 public: 181 /// Create an Reconstructor whose resulting \p UnwrappedLine will start at 182 /// \p Level, using the map from name identifier token to the corresponding 183 /// tokens of the spelled macro call. 184 MacroCallReconstructor( 185 unsigned Level, 186 const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> 187 &ActiveExpansions); 188 189 /// For the given \p Line, match all occurences of tokens expanded from a 190 /// macro to unwrapped lines in the spelled macro call so that the resulting 191 /// tree of unwrapped lines best resembles the structure of unwrapped lines 192 /// passed in via \c addLine. 193 void addLine(const UnwrappedLine &Line); 194 195 /// Check whether at the current state there is no open macro expansion 196 /// that needs to be processed to finish an macro call. 197 /// Only when \c finished() is true, \c takeResult() can be called to retrieve 198 /// the resulting \c UnwrappedLine. 199 /// If there are multiple subsequent macro calls within an unwrapped line in 200 /// the spelled token stream, the calling code may also continue to call 201 /// \c addLine() when \c finished() is true. 202 bool finished() const { return ActiveExpansions.empty(); } 203 204 /// Retrieve the formatted \c UnwrappedLine containing the orginal 205 /// macro calls, formatted according to the expanded token stream received 206 /// via \c addLine(). 207 /// Generally, this line tries to have the same structure as the expanded, 208 /// formatted unwrapped lines handed in via \c addLine(), with the exception 209 /// that for multiple top-level lines, each subsequent line will be the 210 /// child of the last token in its predecessor. This representation is chosen 211 /// because it is a precondition to the formatter that we get what looks like 212 /// a single statement in a single \c UnwrappedLine (i.e. matching parens). 213 /// 214 /// If a token in a macro argument is a child of a token in the expansion, 215 /// the parent will be the corresponding token in the macro call. 216 /// For example: 217 /// #define C(a, b) class C { a b 218 /// C(int x;, int y;) 219 /// would expand to 220 /// class C { int x; int y; 221 /// where in a formatted line "int x;" and "int y;" would both be new separate 222 /// lines. 223 /// 224 /// In the result, "int x;" will be a child of the opening parenthesis in "C(" 225 /// and "int y;" will be a child of the "," token: 226 /// C ( 227 /// \- int x; 228 /// , 229 /// \- int y; 230 /// ) 231 UnwrappedLine takeResult() &&; 232 233 private: 234 void add(FormatToken *Token, FormatToken *ExpandedParent, bool First); 235 void prepareParent(FormatToken *ExpandedParent, bool First); 236 FormatToken *getParentInResult(FormatToken *Parent); 237 void reconstruct(FormatToken *Token); 238 void startReconstruction(FormatToken *Token); 239 bool reconstructActiveCallUntil(FormatToken *Token); 240 void endReconstruction(FormatToken *Token); 241 bool processNextReconstructed(); 242 void finalize(); 243 244 struct ReconstructedLine; 245 246 void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr); 247 UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level); 248 void debug(const ReconstructedLine &Line, int Level); 249 ReconstructedLine &parentLine(); 250 ReconstructedLine *currentLine(); 251 void debugParentMap() const; 252 253 #ifndef NDEBUG 254 enum ReconstructorState { 255 Start, // No macro expansion was found in the input yet. 256 InProgress, // During a macro reconstruction. 257 Finalized, // Past macro reconstruction, the result is finalized. 258 }; 259 ReconstructorState State = Start; 260 #endif 261 262 // Node in which we build up the resulting unwrapped line; this type is 263 // analogous to UnwrappedLineNode. 264 struct LineNode { 265 LineNode() = default; 266 LineNode(FormatToken *Tok) : Tok(Tok) {} 267 FormatToken *Tok = nullptr; 268 llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children; 269 }; 270 271 // Line in which we build up the resulting unwrapped line. 272 // FIXME: Investigate changing UnwrappedLine to a pointer type and using it 273 // instead of rolling our own type. 274 struct ReconstructedLine { 275 llvm::SmallVector<std::unique_ptr<LineNode>> Tokens; 276 }; 277 278 // The line in which we collect the resulting reconstructed output. 279 // To reduce special cases in the algorithm, the first level of the line 280 // contains a single null token that has the reconstructed incoming 281 // lines as children. 282 // In the end, we stich the lines together so that each subsequent line 283 // is a child of the last token of the previous line. This is necessary 284 // in order to format the overall expression as a single logical line - 285 // if we created separate lines, we'd format them with their own top-level 286 // indent depending on the semantic structure, which is not desired. 287 ReconstructedLine Result; 288 289 // Stack of currently "open" lines, where each line's predecessor's last 290 // token is the parent token for that line. 291 llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines; 292 293 // Maps from the expanded token to the token that takes its place in the 294 // reconstructed token stream in terms of parent-child relationships. 295 // Note that it might take multiple steps to arrive at the correct 296 // parent in the output. 297 // Given: #define C(a, b) []() { a; b; } 298 // And a call: C(f(), g()) 299 // The structure in the incoming formatted unwrapped line will be: 300 // []() { 301 // |- f(); 302 // \- g(); 303 // } 304 // with f and g being children of the opening brace. 305 // In the reconstructed call: 306 // C(f(), g()) 307 // \- f() 308 // \- g() 309 // We want f to be a child of the opening parenthesis and g to be a child 310 // of the comma token in the macro call. 311 // Thus, we map 312 // { -> ( 313 // and add 314 // ( -> , 315 // once we're past the comma in the reconstruction. 316 llvm::DenseMap<FormatToken *, FormatToken *> 317 SpelledParentToReconstructedParent; 318 319 // Keeps track of a single expansion while we're reconstructing tokens it 320 // generated. 321 struct Expansion { 322 // The identifier token of the macro call. 323 FormatToken *ID; 324 // Our current position in the reconstruction. 325 std::list<UnwrappedLineNode>::iterator SpelledI; 326 // The end of the reconstructed token sequence. 327 std::list<UnwrappedLineNode>::iterator SpelledE; 328 }; 329 330 // Stack of macro calls for which we're in the middle of an expansion. 331 llvm::SmallVector<Expansion> ActiveExpansions; 332 333 struct MacroCallState { 334 MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken, 335 FormatToken *MacroCallLParen); 336 337 ReconstructedLine *Line; 338 339 // The last token in the parent line or expansion, or nullptr if the macro 340 // expansion is on a top-level line. 341 // 342 // For example, in the macro call: 343 // auto f = []() { ID(1); }; 344 // The MacroCallState for ID will have '{' as ParentLastToken. 345 // 346 // In the macro call: 347 // ID(ID(void f())); 348 // The MacroCallState of the outer ID will have nullptr as ParentLastToken, 349 // while the MacroCallState for the inner ID will have the '(' of the outer 350 // ID as ParentLastToken. 351 // 352 // In the macro call: 353 // ID2(a, ID(b)); 354 // The MacroCallState of ID will have ',' as ParentLastToken. 355 FormatToken *ParentLastToken; 356 357 // The l_paren of this MacroCallState's macro call. 358 FormatToken *MacroCallLParen; 359 }; 360 361 // Keeps track of the lines into which the opening brace/parenthesis & 362 // argument separating commas for each level in the macro call go in order to 363 // put the corresponding closing brace/parenthesis into the same line in the 364 // output and keep track of which parents in the expanded token stream map to 365 // which tokens in the reconstructed stream. 366 // When an opening brace/parenthesis has children, we want the structure of 367 // the output line to be: 368 // |- MACRO 369 // |- ( 370 // | \- <argument> 371 // |- , 372 // | \- <argument> 373 // \- ) 374 llvm::SmallVector<MacroCallState> MacroCallStructure; 375 376 // Level the generated UnwrappedLine will be at. 377 const unsigned Level; 378 379 // Maps from identifier of the macro call to an unwrapped line containing 380 // all tokens of the macro call. 381 const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> 382 &IdToReconstructed; 383 }; 384 385 } // namespace format 386 } // namespace clang 387 388 #endif 389