1 //===- BuildTree.cpp ------------------------------------------*- C++ -*-=====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "clang/Tooling/Syntax/BuildTree.h" 9 #include "clang/AST/RecursiveASTVisitor.h" 10 #include "clang/AST/Stmt.h" 11 #include "clang/Basic/LLVM.h" 12 #include "clang/Basic/SourceLocation.h" 13 #include "clang/Basic/SourceManager.h" 14 #include "clang/Basic/TokenKinds.h" 15 #include "clang/Lex/Lexer.h" 16 #include "clang/Tooling/Syntax/Nodes.h" 17 #include "clang/Tooling/Syntax/Tokens.h" 18 #include "clang/Tooling/Syntax/Tree.h" 19 #include "llvm/ADT/ArrayRef.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/Support/Allocator.h" 23 #include "llvm/Support/Casting.h" 24 #include "llvm/Support/FormatVariadic.h" 25 #include "llvm/Support/raw_ostream.h" 26 #include <map> 27 28 using namespace clang; 29 30 /// A helper class for constructing the syntax tree while traversing a clang 31 /// AST. 32 /// 33 /// At each point of the traversal we maintain a list of pending nodes. 34 /// Initially all tokens are added as pending nodes. When processing a clang AST 35 /// node, the clients need to: 36 /// - create a corresponding syntax node, 37 /// - assign roles to all pending child nodes with 'markChild' and 38 /// 'markChildToken', 39 /// - replace the child nodes with the new syntax node in the pending list 40 /// with 'foldNode'. 41 /// 42 /// Note that all children are expected to be processed when building a node. 43 /// 44 /// Call finalize() to finish building the tree and consume the root node. 45 class syntax::TreeBuilder { 46 public: 47 TreeBuilder(syntax::Arena &Arena) : Arena(Arena), Pending(Arena) {} 48 49 llvm::BumpPtrAllocator &allocator() { return Arena.allocator(); } 50 51 /// Populate children for \p New node, assuming it covers tokens from \p 52 /// Range. 53 void foldNode(llvm::ArrayRef<syntax::Token> Range, syntax::Tree *New); 54 55 /// Set role for a token starting at \p Loc. 56 void markChildToken(SourceLocation Loc, tok::TokenKind Kind, NodeRole R); 57 58 /// Finish building the tree and consume the root node. 59 syntax::TranslationUnit *finalize() && { 60 auto Tokens = Arena.tokenBuffer().expandedTokens(); 61 // Build the root of the tree, consuming all the children. 62 Pending.foldChildren(Tokens, 63 new (Arena.allocator()) syntax::TranslationUnit); 64 65 return cast<syntax::TranslationUnit>(std::move(Pending).finalize()); 66 } 67 68 /// getRange() finds the syntax tokens corresponding to the passed source 69 /// locations. 70 /// \p First is the start position of the first token and \p Last is the start 71 /// position of the last token. 72 llvm::ArrayRef<syntax::Token> getRange(SourceLocation First, 73 SourceLocation Last) const { 74 assert(First.isValid()); 75 assert(Last.isValid()); 76 assert(First == Last || 77 Arena.sourceManager().isBeforeInTranslationUnit(First, Last)); 78 return llvm::makeArrayRef(findToken(First), std::next(findToken(Last))); 79 } 80 llvm::ArrayRef<syntax::Token> getRange(const Decl *D) const { 81 return getRange(D->getBeginLoc(), D->getEndLoc()); 82 } 83 llvm::ArrayRef<syntax::Token> getRange(const Stmt *S) const { 84 return getRange(S->getBeginLoc(), S->getEndLoc()); 85 } 86 87 private: 88 /// Finds a token starting at \p L. The token must exist. 89 const syntax::Token *findToken(SourceLocation L) const; 90 91 /// A collection of trees covering the input tokens. 92 /// When created, each tree corresponds to a single token in the file. 93 /// Clients call 'foldChildren' to attach one or more subtrees to a parent 94 /// node and update the list of trees accordingly. 95 /// 96 /// Ensures that added nodes properly nest and cover the whole token stream. 97 struct Forest { 98 Forest(syntax::Arena &A) { 99 // FIXME: do not add 'eof' to the tree. 100 101 // Create all leaf nodes. 102 for (auto &T : A.tokenBuffer().expandedTokens()) 103 Trees.insert(Trees.end(), 104 {&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}}); 105 } 106 107 void assignRole(llvm::ArrayRef<syntax::Token> Range, 108 syntax::NodeRole Role) { 109 assert(!Range.empty()); 110 auto It = Trees.lower_bound(Range.begin()); 111 assert(It != Trees.end() && "no node found"); 112 assert(It->first == Range.begin() && "no child with the specified range"); 113 assert((std::next(It) == Trees.end() || 114 std::next(It)->first == Range.end()) && 115 "no child with the specified range"); 116 It->second.Role = Role; 117 } 118 119 /// Add \p Node to the forest and fill its children nodes based on the \p 120 /// NodeRange. 121 void foldChildren(llvm::ArrayRef<syntax::Token> NodeTokens, 122 syntax::Tree *Node) { 123 assert(!NodeTokens.empty()); 124 assert(Node->firstChild() == nullptr && "node already has children"); 125 126 auto *FirstToken = NodeTokens.begin(); 127 auto BeginChildren = Trees.lower_bound(FirstToken); 128 assert(BeginChildren != Trees.end() && 129 BeginChildren->first == FirstToken && 130 "fold crosses boundaries of existing subtrees"); 131 auto EndChildren = Trees.lower_bound(NodeTokens.end()); 132 assert((EndChildren == Trees.end() || 133 EndChildren->first == NodeTokens.end()) && 134 "fold crosses boundaries of existing subtrees"); 135 136 // (!) we need to go in reverse order, because we can only prepend. 137 for (auto It = EndChildren; It != BeginChildren; --It) 138 Node->prependChildLowLevel(std::prev(It)->second.Node, 139 std::prev(It)->second.Role); 140 141 Trees.erase(BeginChildren, EndChildren); 142 Trees.insert({FirstToken, NodeAndRole(Node)}); 143 } 144 145 // EXPECTS: all tokens were consumed and are owned by a single root node. 146 syntax::Node *finalize() && { 147 assert(Trees.size() == 1); 148 auto *Root = Trees.begin()->second.Node; 149 Trees = {}; 150 return Root; 151 } 152 153 std::string str(const syntax::Arena &A) const { 154 std::string R; 155 for (auto It = Trees.begin(); It != Trees.end(); ++It) { 156 unsigned CoveredTokens = 157 It != Trees.end() 158 ? (std::next(It)->first - It->first) 159 : A.tokenBuffer().expandedTokens().end() - It->first; 160 161 R += llvm::formatv("- '{0}' covers '{1}'+{2} tokens\n", 162 It->second.Node->kind(), 163 It->first->text(A.sourceManager()), CoveredTokens); 164 R += It->second.Node->dump(A); 165 } 166 return R; 167 } 168 169 private: 170 /// A with a role that should be assigned to it when adding to a parent. 171 struct NodeAndRole { 172 explicit NodeAndRole(syntax::Node *Node) 173 : Node(Node), Role(NodeRole::Unknown) {} 174 175 syntax::Node *Node; 176 NodeRole Role; 177 }; 178 179 /// Maps from the start token to a subtree starting at that token. 180 /// FIXME: storing the end tokens is redundant. 181 /// FIXME: the key of a map is redundant, it is also stored in NodeForRange. 182 std::map<const syntax::Token *, NodeAndRole> Trees; 183 }; 184 185 /// For debugging purposes. 186 std::string str() { return Pending.str(Arena); } 187 188 syntax::Arena &Arena; 189 Forest Pending; 190 }; 191 192 namespace { 193 class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> { 194 public: 195 explicit BuildTreeVisitor(ASTContext &Ctx, syntax::TreeBuilder &Builder) 196 : Builder(Builder), LangOpts(Ctx.getLangOpts()) {} 197 198 bool shouldTraversePostOrder() const { return true; } 199 200 bool TraverseDecl(Decl *D) { 201 if (!D || isa<TranslationUnitDecl>(D)) 202 return RecursiveASTVisitor::TraverseDecl(D); 203 if (!llvm::isa<TranslationUnitDecl>(D->getDeclContext())) 204 return true; // Only build top-level decls for now, do not recurse. 205 return RecursiveASTVisitor::TraverseDecl(D); 206 } 207 208 bool VisitDecl(Decl *D) { 209 assert(llvm::isa<TranslationUnitDecl>(D->getDeclContext()) && 210 "expected a top-level decl"); 211 assert(!D->isImplicit()); 212 Builder.foldNode(Builder.getRange(D), 213 new (allocator()) syntax::TopLevelDeclaration()); 214 return true; 215 } 216 217 bool WalkUpFromTranslationUnitDecl(TranslationUnitDecl *TU) { 218 // (!) we do not want to call VisitDecl(), the declaration for translation 219 // unit is built by finalize(). 220 return true; 221 } 222 223 bool WalkUpFromCompoundStmt(CompoundStmt *S) { 224 using NodeRole = syntax::NodeRole; 225 226 Builder.markChildToken(S->getLBracLoc(), tok::l_brace, 227 NodeRole::CompoundStatement_lbrace); 228 Builder.markChildToken(S->getRBracLoc(), tok::r_brace, 229 NodeRole::CompoundStatement_rbrace); 230 231 Builder.foldNode(Builder.getRange(S), 232 new (allocator()) syntax::CompoundStatement); 233 return true; 234 } 235 236 private: 237 /// A small helper to save some typing. 238 llvm::BumpPtrAllocator &allocator() { return Builder.allocator(); } 239 240 syntax::TreeBuilder &Builder; 241 const LangOptions &LangOpts; 242 }; 243 } // namespace 244 245 void syntax::TreeBuilder::foldNode(llvm::ArrayRef<syntax::Token> Range, 246 syntax::Tree *New) { 247 Pending.foldChildren(Range, New); 248 } 249 250 void syntax::TreeBuilder::markChildToken(SourceLocation Loc, 251 tok::TokenKind Kind, NodeRole Role) { 252 if (Loc.isInvalid()) 253 return; 254 Pending.assignRole(*findToken(Loc), Role); 255 } 256 257 const syntax::Token *syntax::TreeBuilder::findToken(SourceLocation L) const { 258 auto Tokens = Arena.tokenBuffer().expandedTokens(); 259 auto &SM = Arena.sourceManager(); 260 auto It = llvm::partition_point(Tokens, [&](const syntax::Token &T) { 261 return SM.isBeforeInTranslationUnit(T.location(), L); 262 }); 263 assert(It != Tokens.end()); 264 assert(It->location() == L); 265 return &*It; 266 } 267 268 syntax::TranslationUnit * 269 syntax::buildSyntaxTree(Arena &A, const TranslationUnitDecl &TU) { 270 TreeBuilder Builder(A); 271 BuildTreeVisitor(TU.getASTContext(), Builder).TraverseAST(TU.getASTContext()); 272 return std::move(Builder).finalize(); 273 } 274