//===- Tree.h - structure of the syntax tree ------------------*- C++ -*-=====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Defines the basic structure of the syntax tree. There are two kinds of nodes:
//   - leaf nodes correspond to a token in the expanded token stream,
//   - tree nodes correspond to language grammar constructs.
//
// The tree is initially built from an AST. Each node of a newly built tree
// covers a continuous subrange of expanded tokens (i.e. tokens after
// preprocessing), the specific tokens covered are stored in the leaf nodes of
// a tree. A post-order traversal of a tree will visit leaf nodes in an order
// corresponding to the original order of expanded tokens.
//
// This is still work in progress and highly experimental, we leave room for
// ourselves to completely change the design and/or implementation.
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_H
#define LLVM_CLANG_TOOLING_SYNTAX_TREE_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Tooling/Syntax/Tokens.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/iterator.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>
#include <iterator>

namespace clang {
namespace syntax {

/// A memory arena for syntax trees. Also tracks the underlying token buffers,
/// source manager, etc.
41 class Arena { 42 public: 43 Arena(SourceManager &SourceMgr, const LangOptions &LangOpts, 44 const TokenBuffer &Tokens); 45 46 const SourceManager &getSourceManager() const { return SourceMgr; } 47 const LangOptions &getLangOptions() const { return LangOpts; } 48 49 const TokenBuffer &getTokenBuffer() const; 50 llvm::BumpPtrAllocator &getAllocator() { return Allocator; } 51 52 private: 53 /// Add \p Buffer to the underlying source manager, tokenize it and store the 54 /// resulting tokens. Used exclusively in `FactoryImpl` to materialize tokens 55 /// that were not written in user code. 56 std::pair<FileID, ArrayRef<Token>> 57 lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Buffer); 58 friend class FactoryImpl; 59 60 private: 61 SourceManager &SourceMgr; 62 const LangOptions &LangOpts; 63 const TokenBuffer &Tokens; 64 /// IDs and storage for additional tokenized files. 65 llvm::DenseMap<FileID, std::vector<Token>> ExtraTokens; 66 /// Keeps all the allocated nodes and their intermediate data structures. 67 llvm::BumpPtrAllocator Allocator; 68 }; 69 70 class Tree; 71 class TreeBuilder; 72 class FactoryImpl; 73 class MutationsImpl; 74 75 enum class NodeKind : uint16_t; 76 enum class NodeRole : uint8_t; 77 78 /// A node in a syntax tree. Each node is either a Leaf (representing tokens) or 79 /// a Tree (representing language constructrs). 80 class Node { 81 protected: 82 /// Newly created nodes are detached from a tree, parent and sibling links are 83 /// set when the node is added as a child to another one. 84 Node(NodeKind Kind); 85 /// Nodes are allocated on Arenas; the destructor is never called. 86 ~Node() = default; 87 88 public: 89 /// Nodes cannot simply be copied without violating tree invariants. 90 Node(const Node &) = delete; 91 Node &operator=(const Node &) = delete; 92 /// Idiomatically, nodes are allocated on an Arena and never moved. 
93 Node(Node &&) = delete; 94 Node &operator=(Node &&) = delete; 95 96 NodeKind getKind() const { return static_cast<NodeKind>(Kind); } 97 NodeRole getRole() const { return static_cast<NodeRole>(Role); } 98 99 /// Whether the node is detached from a tree, i.e. does not have a parent. 100 bool isDetached() const; 101 /// Whether the node was created from the AST backed by the source code 102 /// rather than added later through mutation APIs or created with factory 103 /// functions. 104 /// When this flag is true, all subtrees are also original. 105 /// This flag is set to false on any modifications to the node or any of its 106 /// subtrees, even if this simply involves swapping existing subtrees. 107 bool isOriginal() const { return Original; } 108 /// If this function return false, the tree cannot be modified because there 109 /// is no reasonable way to produce the corresponding textual replacements. 110 /// This can happen when the node crosses macro expansion boundaries. 111 /// 112 /// Note that even if the node is not modifiable, its child nodes can be 113 /// modifiable. 114 bool canModify() const { return CanModify; } 115 116 const Tree *getParent() const { return Parent; } 117 Tree *getParent() { return Parent; } 118 119 const Node *getNextSibling() const { return NextSibling; } 120 Node *getNextSibling() { return NextSibling; } 121 const Node *getPreviousSibling() const { return PreviousSibling; } 122 Node *getPreviousSibling() { return PreviousSibling; } 123 124 /// Dumps the structure of a subtree. For debugging and testing purposes. 125 std::string dump(const SourceManager &SM) const; 126 /// Dumps the tokens forming this subtree. 127 std::string dumpTokens(const SourceManager &SM) const; 128 129 /// Asserts invariants on this node of the tree and its immediate children. 130 /// Will not recurse into the subtree. No-op if NDEBUG is set. 131 void assertInvariants() const; 132 /// Runs checkInvariants on all nodes in the subtree. No-op if NDEBUG is set. 
133 void assertInvariantsRecursive() const; 134 135 private: 136 // Tree is allowed to change the Parent link and Role. 137 friend class Tree; 138 // TreeBuilder is allowed to set the Original and CanModify flags. 139 friend class TreeBuilder; 140 // MutationsImpl sets roles and CanModify flag. 141 friend class MutationsImpl; 142 // FactoryImpl sets CanModify flag. 143 friend class FactoryImpl; 144 145 void setRole(NodeRole NR); 146 147 Tree *Parent; 148 Node *NextSibling; 149 Node *PreviousSibling; 150 unsigned Kind : 16; 151 unsigned Role : 8; 152 unsigned Original : 1; 153 unsigned CanModify : 1; 154 }; 155 156 /// A leaf node points to a single token inside the expanded token stream. 157 class Leaf final : public Node { 158 public: 159 Leaf(const Token *T); 160 static bool classof(const Node *N); 161 162 const Token *getToken() const { return Tok; } 163 164 private: 165 const Token *Tok; 166 }; 167 168 /// A node that has children and represents a syntactic language construct. 169 class Tree : public Node { 170 /// Iterator over children (common base for const/non-const). 171 /// Not invalidated by tree mutations (holds a stable node pointer). 172 template <typename DerivedT, typename NodeT> 173 class ChildIteratorBase 174 : public llvm::iterator_facade_base<DerivedT, std::forward_iterator_tag, 175 NodeT> { 176 protected: 177 NodeT *N = nullptr; 178 using Base = ChildIteratorBase; 179 180 public: 181 ChildIteratorBase() = default; 182 explicit ChildIteratorBase(NodeT *N) : N(N) {} 183 184 friend bool operator==(const DerivedT &LHS, const DerivedT &RHS) { 185 return LHS.N == RHS.N; 186 } 187 188 NodeT &operator*() const { return *N; } 189 DerivedT &operator++() { 190 N = N->getNextSibling(); 191 return *static_cast<DerivedT *>(this); 192 } 193 194 /// Truthy if valid (not past-the-end). 195 /// This allows: if (auto It = find_if(N.children(), ...) ) 196 explicit operator bool() const { return N != nullptr; } 197 /// The element, or nullptr if past-the-end. 
198 NodeT *asPointer() const { return N; } 199 }; 200 201 public: 202 static bool classof(const Node *N); 203 204 Node *getFirstChild() { return FirstChild; } 205 const Node *getFirstChild() const { return FirstChild; } 206 Node *getLastChild() { return LastChild; } 207 const Node *getLastChild() const { return LastChild; } 208 209 const Leaf *findFirstLeaf() const; 210 Leaf *findFirstLeaf() { 211 return const_cast<Leaf *>(const_cast<const Tree *>(this)->findFirstLeaf()); 212 } 213 214 const Leaf *findLastLeaf() const; 215 Leaf *findLastLeaf() { 216 return const_cast<Leaf *>(const_cast<const Tree *>(this)->findLastLeaf()); 217 } 218 219 /// child_iterator is not invalidated by mutations. 220 struct ChildIterator : ChildIteratorBase<ChildIterator, Node> { 221 using Base::ChildIteratorBase; 222 }; 223 struct ConstChildIterator 224 : ChildIteratorBase<ConstChildIterator, const Node> { 225 using Base::ChildIteratorBase; 226 ConstChildIterator() = default; 227 ConstChildIterator(const ChildIterator &I) : Base(I.asPointer()) {} 228 }; 229 230 llvm::iterator_range<ChildIterator> getChildren() { 231 return {ChildIterator(getFirstChild()), ChildIterator()}; 232 } 233 llvm::iterator_range<ConstChildIterator> getChildren() const { 234 return {ConstChildIterator(getFirstChild()), ConstChildIterator()}; 235 } 236 237 /// Find the first node with a corresponding role. 238 const Node *findChild(NodeRole R) const; 239 Node *findChild(NodeRole R) { 240 return const_cast<Node *>(const_cast<const Tree *>(this)->findChild(R)); 241 } 242 243 protected: 244 using Node::Node; 245 246 private: 247 /// Append \p Child to the list of children and sets the parent pointer. 248 /// A very low-level operation that does not check any invariants, only used 249 /// by TreeBuilder and FactoryImpl. 250 /// EXPECTS: Role != Detached. 251 void appendChildLowLevel(Node *Child, NodeRole Role); 252 /// Similar but prepends. 
253 void prependChildLowLevel(Node *Child, NodeRole Role); 254 255 /// Like the previous overloads, but does not set role for \p Child. 256 /// EXPECTS: Child->Role != Detached 257 void appendChildLowLevel(Node *Child); 258 void prependChildLowLevel(Node *Child); 259 friend class TreeBuilder; 260 friend class FactoryImpl; 261 262 /// Replace a range of children [Begin, End) with a list of 263 /// new nodes starting at \p New. 264 /// Only used by MutationsImpl to implement higher-level mutation operations. 265 /// (!) \p New can be null to model removal of the child range. 266 /// (!) \p End can be null to model one past the end. 267 /// (!) \p Begin can be null to model an append. 268 void replaceChildRangeLowLevel(Node *Begin, Node *End, Node *New); 269 friend class MutationsImpl; 270 271 Node *FirstChild = nullptr; 272 Node *LastChild = nullptr; 273 }; 274 275 /// A list of Elements separated or terminated by a fixed token. 276 /// 277 /// This type models the following grammar construct: 278 /// delimited-list(element, delimiter, termination, canBeEmpty) 279 class List : public Tree { 280 public: 281 template <typename Element> struct ElementAndDelimiter { 282 Element *element; 283 Leaf *delimiter; 284 }; 285 286 enum class TerminationKind { 287 Terminated, 288 MaybeTerminated, 289 Separated, 290 }; 291 292 using Tree::Tree; 293 static bool classof(const Node *N); 294 /// Returns the elements and corresponding delimiters. Missing elements 295 /// and delimiters are represented as null pointers. 
296 /// 297 /// For example, in a separated list: 298 /// "a, b, c" <=> [("a" , ","), ("b" , "," ), ("c" , null)] 299 /// "a, , c" <=> [("a" , ","), (null, "," ), ("c" , null)] 300 /// "a, b c" <=> [("a" , ","), ("b" , null), ("c" , null)] 301 /// "a, b," <=> [("a" , ","), ("b" , "," ), (null, null)] 302 /// 303 /// In a terminated or maybe-terminated list: 304 /// "a; b; c;" <=> [("a" , ";"), ("b" , ";" ), ("c" , ";" )] 305 /// "a; ; c;" <=> [("a" , ";"), (null, ";" ), ("c" , ";" )] 306 /// "a; b c;" <=> [("a" , ";"), ("b" , null), ("c" , ";" )] 307 /// "a; b; c" <=> [("a" , ";"), ("b" , ";" ), ("c" , null)] 308 std::vector<ElementAndDelimiter<Node>> getElementsAsNodesAndDelimiters(); 309 310 /// Returns the elements of the list. Missing elements are represented 311 /// as null pointers in the same way as in the return value of 312 /// `getElementsAsNodesAndDelimiters()`. 313 std::vector<Node *> getElementsAsNodes(); 314 315 // These can't be implemented with the information we have! 316 317 /// Returns the appropriate delimiter for this list. 318 /// 319 /// Useful for discovering the correct delimiter to use when adding 320 /// elements to empty or one-element lists. 321 clang::tok::TokenKind getDelimiterTokenKind() const; 322 323 TerminationKind getTerminationKind() const; 324 325 /// Whether this list can be empty in syntactically and semantically correct 326 /// code. 327 /// 328 /// This list may be empty when the source code has errors even if 329 /// canBeEmpty() returns false. 330 bool canBeEmpty() const; 331 }; 332 333 } // namespace syntax 334 } // namespace clang 335 336 #endif 337