1 //===- BuildTree.cpp ------------------------------------------*- C++ -*-=====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "clang/Tooling/Syntax/BuildTree.h" 9 #include "clang/AST/RecursiveASTVisitor.h" 10 #include "clang/AST/Stmt.h" 11 #include "clang/Basic/LLVM.h" 12 #include "clang/Basic/SourceLocation.h" 13 #include "clang/Basic/SourceManager.h" 14 #include "clang/Basic/TokenKinds.h" 15 #include "clang/Lex/Lexer.h" 16 #include "clang/Tooling/Syntax/Nodes.h" 17 #include "clang/Tooling/Syntax/Tokens.h" 18 #include "clang/Tooling/Syntax/Tree.h" 19 #include "llvm/ADT/ArrayRef.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/Support/Allocator.h" 23 #include "llvm/Support/Casting.h" 24 #include "llvm/Support/FormatVariadic.h" 25 #include "llvm/Support/raw_ostream.h" 26 #include <map> 27 28 using namespace clang; 29 30 /// A helper class for constructing the syntax tree while traversing a clang 31 /// AST. 32 /// 33 /// At each point of the traversal we maintain a list of pending nodes. 34 /// Initially all tokens are added as pending nodes. When processing a clang AST 35 /// node, the clients need to: 36 /// - create a corresponding syntax node, 37 /// - assign roles to all pending child nodes with 'markChild' and 38 /// 'markChildToken', 39 /// - replace the child nodes with the new syntax node in the pending list 40 /// with 'foldNode'. 41 /// 42 /// Note that all children are expected to be processed when building a node. 43 /// 44 /// Call finalize() to finish building the tree and consume the root node. 45 class syntax::TreeBuilder { 46 public: 47 TreeBuilder(syntax::Arena &Arena) : Arena(Arena), Pending(Arena) {} 48 49 llvm::BumpPtrAllocator &allocator() { return Arena.allocator(); } 50 51 /// Populate children for \p New node, assuming it covers tokens from \p 52 /// Range. 53 void foldNode(llvm::ArrayRef<syntax::Token> Range, syntax::Tree *New); 54 55 /// Set role for a token starting at \p Loc. 56 void markChildToken(SourceLocation Loc, tok::TokenKind Kind, NodeRole R); 57 58 /// Finish building the tree and consume the root node. 59 syntax::TranslationUnit *finalize() && { 60 auto Tokens = Arena.tokenBuffer().expandedTokens(); 61 assert(!Tokens.empty()); 62 assert(Tokens.back().kind() == tok::eof); 63 64 // Build the root of the tree, consuming all the children. 65 Pending.foldChildren(Tokens.drop_back(), 66 new (Arena.allocator()) syntax::TranslationUnit); 67 68 return cast<syntax::TranslationUnit>(std::move(Pending).finalize()); 69 } 70 71 /// getRange() finds the syntax tokens corresponding to the passed source 72 /// locations. 73 /// \p First is the start position of the first token and \p Last is the start 74 /// position of the last token. 75 llvm::ArrayRef<syntax::Token> getRange(SourceLocation First, 76 SourceLocation Last) const { 77 assert(First.isValid()); 78 assert(Last.isValid()); 79 assert(First == Last || 80 Arena.sourceManager().isBeforeInTranslationUnit(First, Last)); 81 return llvm::makeArrayRef(findToken(First), std::next(findToken(Last))); 82 } 83 llvm::ArrayRef<syntax::Token> getRange(const Decl *D) const { 84 return getRange(D->getBeginLoc(), D->getEndLoc()); 85 } 86 llvm::ArrayRef<syntax::Token> getRange(const Stmt *S) const { 87 return getRange(S->getBeginLoc(), S->getEndLoc()); 88 } 89 90 private: 91 /// Finds a token starting at \p L. The token must exist. 92 const syntax::Token *findToken(SourceLocation L) const; 93 94 /// A collection of trees covering the input tokens. 95 /// When created, each tree corresponds to a single token in the file. 96 /// Clients call 'foldChildren' to attach one or more subtrees to a parent 97 /// node and update the list of trees accordingly. 98 /// 99 /// Ensures that added nodes properly nest and cover the whole token stream. 100 struct Forest { 101 Forest(syntax::Arena &A) { 102 assert(!A.tokenBuffer().expandedTokens().empty()); 103 assert(A.tokenBuffer().expandedTokens().back().kind() == tok::eof); 104 // Create all leaf nodes. 105 // Note that we do not have 'eof' in the tree. 106 for (auto &T : A.tokenBuffer().expandedTokens().drop_back()) 107 Trees.insert(Trees.end(), 108 {&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}}); 109 } 110 111 void assignRole(llvm::ArrayRef<syntax::Token> Range, 112 syntax::NodeRole Role) { 113 assert(!Range.empty()); 114 auto It = Trees.lower_bound(Range.begin()); 115 assert(It != Trees.end() && "no node found"); 116 assert(It->first == Range.begin() && "no child with the specified range"); 117 assert((std::next(It) == Trees.end() || 118 std::next(It)->first == Range.end()) && 119 "no child with the specified range"); 120 It->second.Role = Role; 121 } 122 123 /// Add \p Node to the forest and fill its children nodes based on the \p 124 /// NodeRange. 125 void foldChildren(llvm::ArrayRef<syntax::Token> NodeTokens, 126 syntax::Tree *Node) { 127 assert(!NodeTokens.empty()); 128 assert(Node->firstChild() == nullptr && "node already has children"); 129 130 auto *FirstToken = NodeTokens.begin(); 131 auto BeginChildren = Trees.lower_bound(FirstToken); 132 assert(BeginChildren != Trees.end() && 133 BeginChildren->first == FirstToken && 134 "fold crosses boundaries of existing subtrees"); 135 auto EndChildren = Trees.lower_bound(NodeTokens.end()); 136 assert((EndChildren == Trees.end() || 137 EndChildren->first == NodeTokens.end()) && 138 "fold crosses boundaries of existing subtrees"); 139 140 // (!) we need to go in reverse order, because we can only prepend. 141 for (auto It = EndChildren; It != BeginChildren; --It) 142 Node->prependChildLowLevel(std::prev(It)->second.Node, 143 std::prev(It)->second.Role); 144 145 Trees.erase(BeginChildren, EndChildren); 146 Trees.insert({FirstToken, NodeAndRole(Node)}); 147 } 148 149 // EXPECTS: all tokens were consumed and are owned by a single root node. 150 syntax::Node *finalize() && { 151 assert(Trees.size() == 1); 152 auto *Root = Trees.begin()->second.Node; 153 Trees = {}; 154 return Root; 155 } 156 157 std::string str(const syntax::Arena &A) const { 158 std::string R; 159 for (auto It = Trees.begin(); It != Trees.end(); ++It) { 160 unsigned CoveredTokens = 161 It != Trees.end() 162 ? (std::next(It)->first - It->first) 163 : A.tokenBuffer().expandedTokens().end() - It->first; 164 165 R += llvm::formatv("- '{0}' covers '{1}'+{2} tokens\n", 166 It->second.Node->kind(), 167 It->first->text(A.sourceManager()), CoveredTokens); 168 R += It->second.Node->dump(A); 169 } 170 return R; 171 } 172 173 private: 174 /// A with a role that should be assigned to it when adding to a parent. 175 struct NodeAndRole { 176 explicit NodeAndRole(syntax::Node *Node) 177 : Node(Node), Role(NodeRole::Unknown) {} 178 179 syntax::Node *Node; 180 NodeRole Role; 181 }; 182 183 /// Maps from the start token to a subtree starting at that token. 184 /// FIXME: storing the end tokens is redundant. 185 /// FIXME: the key of a map is redundant, it is also stored in NodeForRange. 186 std::map<const syntax::Token *, NodeAndRole> Trees; 187 }; 188 189 /// For debugging purposes. 190 std::string str() { return Pending.str(Arena); } 191 192 syntax::Arena &Arena; 193 Forest Pending; 194 }; 195 196 namespace { 197 class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> { 198 public: 199 explicit BuildTreeVisitor(ASTContext &Ctx, syntax::TreeBuilder &Builder) 200 : Builder(Builder), LangOpts(Ctx.getLangOpts()) {} 201 202 bool shouldTraversePostOrder() const { return true; } 203 204 bool TraverseDecl(Decl *D) { 205 if (!D || isa<TranslationUnitDecl>(D)) 206 return RecursiveASTVisitor::TraverseDecl(D); 207 if (!llvm::isa<TranslationUnitDecl>(D->getDeclContext())) 208 return true; // Only build top-level decls for now, do not recurse. 209 return RecursiveASTVisitor::TraverseDecl(D); 210 } 211 212 bool VisitDecl(Decl *D) { 213 assert(llvm::isa<TranslationUnitDecl>(D->getDeclContext()) && 214 "expected a top-level decl"); 215 assert(!D->isImplicit()); 216 Builder.foldNode(Builder.getRange(D), 217 new (allocator()) syntax::TopLevelDeclaration()); 218 return true; 219 } 220 221 bool WalkUpFromTranslationUnitDecl(TranslationUnitDecl *TU) { 222 // (!) we do not want to call VisitDecl(), the declaration for translation 223 // unit is built by finalize(). 224 return true; 225 } 226 227 bool WalkUpFromCompoundStmt(CompoundStmt *S) { 228 using NodeRole = syntax::NodeRole; 229 230 Builder.markChildToken(S->getLBracLoc(), tok::l_brace, 231 NodeRole::CompoundStatement_lbrace); 232 Builder.markChildToken(S->getRBracLoc(), tok::r_brace, 233 NodeRole::CompoundStatement_rbrace); 234 235 Builder.foldNode(Builder.getRange(S), 236 new (allocator()) syntax::CompoundStatement); 237 return true; 238 } 239 240 private: 241 /// A small helper to save some typing. 242 llvm::BumpPtrAllocator &allocator() { return Builder.allocator(); } 243 244 syntax::TreeBuilder &Builder; 245 const LangOptions &LangOpts; 246 }; 247 } // namespace 248 249 void syntax::TreeBuilder::foldNode(llvm::ArrayRef<syntax::Token> Range, 250 syntax::Tree *New) { 251 Pending.foldChildren(Range, New); 252 } 253 254 void syntax::TreeBuilder::markChildToken(SourceLocation Loc, 255 tok::TokenKind Kind, NodeRole Role) { 256 if (Loc.isInvalid()) 257 return; 258 Pending.assignRole(*findToken(Loc), Role); 259 } 260 261 const syntax::Token *syntax::TreeBuilder::findToken(SourceLocation L) const { 262 auto Tokens = Arena.tokenBuffer().expandedTokens(); 263 auto &SM = Arena.sourceManager(); 264 auto It = llvm::partition_point(Tokens, [&](const syntax::Token &T) { 265 return SM.isBeforeInTranslationUnit(T.location(), L); 266 }); 267 assert(It != Tokens.end()); 268 assert(It->location() == L); 269 return &*It; 270 } 271 272 syntax::TranslationUnit * 273 syntax::buildSyntaxTree(Arena &A, const TranslationUnitDecl &TU) { 274 TreeBuilder Builder(A); 275 BuildTreeVisitor(TU.getASTContext(), Builder).TraverseAST(TU.getASTContext()); 276 return std::move(Builder).finalize(); 277 } 278