1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // A data structure for fast substring queries. 9 // 10 // Suffix trees represent the suffixes of their input strings in their leaves. 11 // A suffix tree is a type of compressed trie structure where each node 12 // represents an entire substring rather than a single character. Each leaf 13 // of the tree is a suffix. 14 // 15 // A suffix tree can be seen as a type of state machine where each state is a 16 // substring of the full string. The tree is structured so that, for a string 17 // of length N, there are exactly N leaves in the tree. This structure allows 18 // us to quickly find repeated substrings of the input string. 19 // 20 // In this implementation, a "string" is a vector of unsigned integers. 21 // These integers may result from hashing some data type. A suffix tree can 22 // contain 1 or many strings, which can then be queried as one large string. 23 // 24 // The suffix tree is implemented using Ukkonen's algorithm for linear-time 25 // suffix tree construction. Ukkonen's algorithm is explained in more detail 26 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The 27 // paper is available at 28 // 29 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf 30 //===----------------------------------------------------------------------===// 31 32 #ifndef LLVM_SUPPORT_SUFFIXTREE_H 33 #define LLVM_SUPPORT_SUFFIXTREE_H 34 35 #include "llvm/ADT/ArrayRef.h" 36 #include "llvm/Support/Allocator.h" 37 #include "llvm/Support/Compiler.h" 38 #include "llvm/Support/SuffixTreeNode.h" 39 40 namespace llvm { 41 class SuffixTree { 42 public: 43 /// Each element is an integer representing an instruction in the module. 44 ArrayRef<unsigned> Str; 45 46 /// Whether to consider leaf descendants or only leaf children. 47 bool OutlinerLeafDescendants; 48 49 /// A repeated substring in the tree. 50 struct RepeatedSubstring { 51 /// The length of the string. 52 unsigned Length; 53 54 /// The start indices of each occurrence. 55 SmallVector<unsigned> StartIndices; 56 }; 57 58 private: 59 /// Maintains internal nodes in the tree. 60 SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator; 61 /// Maintains leaf nodes in the tree. 62 SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator; 63 64 /// The root of the suffix tree. 65 /// 66 /// The root represents the empty string. It is maintained by the 67 /// \p NodeAllocator like every other node in the tree. 68 SuffixTreeInternalNode *Root = nullptr; 69 70 /// The end index of each leaf in the tree. 71 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx; 72 73 /// Helper struct which keeps track of the next insertion point in 74 /// Ukkonen's algorithm. 75 struct ActiveState { 76 /// The next node to insert at. 77 SuffixTreeInternalNode *Node = nullptr; 78 79 /// The index of the first character in the substring currently being added. 80 unsigned Idx = SuffixTreeNode::EmptyIdx; 81 82 /// The length of the substring we have to add at the current step. 83 unsigned Len = 0; 84 }; 85 86 /// The point the next insertion will take place at in the 87 /// construction algorithm. 88 ActiveState Active; 89 90 /// Allocate a leaf node and add it to the tree. 91 /// 92 /// \param Parent The parent of this node. 93 /// \param StartIdx The start index of this node's associated string. 94 /// \param Edge The label on the edge leaving \p Parent to this node. 95 /// 96 /// \returns A pointer to the allocated leaf node. 97 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx, 98 unsigned Edge); 99 100 /// Allocate an internal node and add it to the tree. 101 /// 102 /// \param Parent The parent of this node. Only null when allocating the root. 103 /// \param StartIdx The start index of this node's associated string. 104 /// \param EndIdx The end index of this node's associated string. 105 /// \param Edge The label on the edge leaving \p Parent to this node. 106 /// 107 /// \returns A pointer to the allocated internal node. 108 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent, 109 unsigned StartIdx, unsigned EndIdx, 110 unsigned Edge); 111 112 /// Allocate the root node and add it to the tree. 113 /// 114 /// \returns A pointer to the root. 115 SuffixTreeInternalNode *insertRoot(); 116 117 /// Set the suffix indices of the leaves to the start indices of their 118 /// respective suffixes. 119 void setSuffixIndices(); 120 121 /// Construct the suffix tree for the prefix of the input ending at 122 /// \p EndIdx. 123 /// 124 /// Used to construct the full suffix tree iteratively. At the end of each 125 /// step, the constructed suffix tree is either a valid suffix tree, or a 126 /// suffix tree with implicit suffixes. At the end of the final step, the 127 /// suffix tree is a valid tree. 128 /// 129 /// \param EndIdx The end index of the current prefix in the main string. 130 /// \param SuffixesToAdd The number of suffixes that must be added 131 /// to complete the suffix tree at the current phase. 132 /// 133 /// \returns The number of suffixes that have not been added at the end of 134 /// this step. 135 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); 136 137 /// This vector contains all leaf nodes of this suffix tree. These leaf nodes 138 /// are identified using post-order depth-first traversal, so that the order 139 /// of these leaf nodes in the vector matches the order of the leaves in the 140 /// tree from left to right if one were to draw the tree on paper. 141 std::vector<SuffixTreeLeafNode *> LeafNodes; 142 143 /// Perform a post-order depth-first traversal of the tree and perform two 144 /// tasks during the traversal. The first is to populate LeafNodes, adding 145 /// nodes in order of the traversal. The second is to keep track of the leaf 146 /// descendants of every internal node by assigning values to LeftLeafIndex 147 /// and RightLefIndex fields of SuffixTreeNode for all internal nodes. 148 void setLeafNodes(); 149 150 public: 151 /// Construct a suffix tree from a sequence of unsigned integers. 152 /// 153 /// \param Str The string to construct the suffix tree for. 154 /// \param OutlinerLeafDescendants Whether to consider leaf descendants or 155 /// only leaf children (used by Machine Outliner). 156 LLVM_ABI SuffixTree(const ArrayRef<unsigned> &Str, 157 bool OutlinerLeafDescendants = false); 158 159 /// Iterator for finding all repeated substrings in the suffix tree. 160 struct RepeatedSubstringIterator { 161 private: 162 /// The current node we're visiting. 163 SuffixTreeNode *N = nullptr; 164 165 /// The repeated substring associated with this node. 166 RepeatedSubstring RS; 167 168 /// The nodes left to visit. 169 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit; 170 171 /// The minimum length of a repeated substring to find. 172 /// Since we're outlining, we want at least two instructions in the range. 173 /// FIXME: This may not be true for targets like X86 which support many 174 /// instruction lengths. 175 const unsigned MinLength = 2; 176 177 /// Vector of leaf nodes of the suffix tree. 178 const std::vector<SuffixTreeLeafNode *> &LeafNodes; 179 180 /// Whether to consider leaf descendants or only leaf children. 181 bool OutlinerLeafDescendants = !LeafNodes.empty(); 182 183 /// Move the iterator to the next repeated substring. 184 LLVM_ABI void advance(); 185 186 public: 187 /// Return the current repeated substring. 188 RepeatedSubstring &operator*() { return RS; } 189 190 RepeatedSubstringIterator &operator++() { 191 advance(); 192 return *this; 193 } 194 195 RepeatedSubstringIterator operator++(int I) { 196 RepeatedSubstringIterator It(*this); 197 advance(); 198 return It; 199 } 200 201 bool operator==(const RepeatedSubstringIterator &Other) const { 202 return N == Other.N; 203 } 204 bool operator!=(const RepeatedSubstringIterator &Other) const { 205 return !(*this == Other); 206 } 207 208 RepeatedSubstringIterator( 209 SuffixTreeInternalNode *N, 210 const std::vector<SuffixTreeLeafNode *> &LeafNodes = {}) NRepeatedSubstringIterator211 : N(N), LeafNodes(LeafNodes) { 212 // Do we have a non-null node? 213 if (!N) 214 return; 215 // Yes. At the first step, we need to visit all of N's children. 216 // Note: This means that we visit N last. 217 InternalNodesToVisit.push_back(N); 218 advance(); 219 } 220 }; 221 222 typedef RepeatedSubstringIterator iterator; begin()223 iterator begin() { return iterator(Root, LeafNodes); } end()224 iterator end() { return iterator(nullptr); } 225 }; 226 227 } // namespace llvm 228 229 #endif // LLVM_SUPPORT_SUFFIXTREE_H 230