xref: /freebsd/contrib/llvm-project/llvm/include/llvm/Support/SuffixTree.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // A data structure for fast substring queries.
9 //
10 // Suffix trees represent the suffixes of their input strings in their leaves.
11 // A suffix tree is a type of compressed trie structure where each node
12 // represents an entire substring rather than a single character. Each leaf
13 // of the tree is a suffix.
14 //
15 // A suffix tree can be seen as a type of state machine where each state is a
16 // substring of the full string. The tree is structured so that, for a string
17 // of length N, there are exactly N leaves in the tree. This structure allows
18 // us to quickly find repeated substrings of the input string.
19 //
20 // In this implementation, a "string" is a vector of unsigned integers.
21 // These integers may result from hashing some data type. A suffix tree can
22 // contain 1 or many strings, which can then be queried as one large string.
23 //
24 // The suffix tree is implemented using Ukkonen's algorithm for linear-time
25 // suffix tree construction. Ukkonen's algorithm is explained in more detail
26 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27 // paper is available at
28 //
29 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30 //===----------------------------------------------------------------------===//
31 
32 #ifndef LLVM_SUPPORT_SUFFIXTREE_H
33 #define LLVM_SUPPORT_SUFFIXTREE_H
34 
35 #include "llvm/ADT/ArrayRef.h"
36 #include "llvm/Support/Allocator.h"
37 #include "llvm/Support/SuffixTreeNode.h"
38 
39 namespace llvm {
40 class SuffixTree {
41 public:
42   /// Each element is an integer representing an instruction in the module.
43   ArrayRef<unsigned> Str;
44 
45   /// Whether to consider leaf descendants or only leaf children.
46   bool OutlinerLeafDescendants;
47 
48   /// A repeated substring in the tree.
49   struct RepeatedSubstring {
50     /// The length of the string.
51     unsigned Length;
52 
53     /// The start indices of each occurrence.
54     SmallVector<unsigned> StartIndices;
55   };
56 
57 private:
58   /// Maintains internal nodes in the tree.
59   SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
60   /// Maintains leaf nodes in the tree.
61   SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
62 
63   /// The root of the suffix tree.
64   ///
65   /// The root represents the empty string. It is maintained by the
66   /// \p NodeAllocator like every other node in the tree.
67   SuffixTreeInternalNode *Root = nullptr;
68 
69   /// The end index of each leaf in the tree.
70   unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
71 
72   /// Helper struct which keeps track of the next insertion point in
73   /// Ukkonen's algorithm.
74   struct ActiveState {
75     /// The next node to insert at.
76     SuffixTreeInternalNode *Node = nullptr;
77 
78     /// The index of the first character in the substring currently being added.
79     unsigned Idx = SuffixTreeNode::EmptyIdx;
80 
81     /// The length of the substring we have to add at the current step.
82     unsigned Len = 0;
83   };
84 
85   /// The point the next insertion will take place at in the
86   /// construction algorithm.
87   ActiveState Active;
88 
89   /// Allocate a leaf node and add it to the tree.
90   ///
91   /// \param Parent The parent of this node.
92   /// \param StartIdx The start index of this node's associated string.
93   /// \param Edge The label on the edge leaving \p Parent to this node.
94   ///
95   /// \returns A pointer to the allocated leaf node.
96   SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
97                              unsigned Edge);
98 
99   /// Allocate an internal node and add it to the tree.
100   ///
101   /// \param Parent The parent of this node. Only null when allocating the root.
102   /// \param StartIdx The start index of this node's associated string.
103   /// \param EndIdx The end index of this node's associated string.
104   /// \param Edge The label on the edge leaving \p Parent to this node.
105   ///
106   /// \returns A pointer to the allocated internal node.
107   SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
108                                              unsigned StartIdx, unsigned EndIdx,
109                                              unsigned Edge);
110 
111   /// Allocate the root node and add it to the tree.
112   ///
113   /// \returns A pointer to the root.
114   SuffixTreeInternalNode *insertRoot();
115 
116   /// Set the suffix indices of the leaves to the start indices of their
117   /// respective suffixes.
118   void setSuffixIndices();
119 
120   /// Construct the suffix tree for the prefix of the input ending at
121   /// \p EndIdx.
122   ///
123   /// Used to construct the full suffix tree iteratively. At the end of each
124   /// step, the constructed suffix tree is either a valid suffix tree, or a
125   /// suffix tree with implicit suffixes. At the end of the final step, the
126   /// suffix tree is a valid tree.
127   ///
128   /// \param EndIdx The end index of the current prefix in the main string.
129   /// \param SuffixesToAdd The number of suffixes that must be added
130   /// to complete the suffix tree at the current phase.
131   ///
132   /// \returns The number of suffixes that have not been added at the end of
133   /// this step.
134   unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
135 
136   /// This vector contains all leaf nodes of this suffix tree. These leaf nodes
137   /// are identified using post-order depth-first traversal, so that the order
138   /// of these leaf nodes in the vector matches the order of the leaves in the
139   /// tree from left to right if one were to draw the tree on paper.
140   std::vector<SuffixTreeLeafNode *> LeafNodes;
141 
142   /// Perform a post-order depth-first traversal of the tree and perform two
143   /// tasks during the traversal. The first is to populate LeafNodes, adding
144   /// nodes in order of the traversal. The second is to keep track of the leaf
145   /// descendants of every internal node by assigning values to LeftLeafIndex
146   /// and RightLefIndex fields of SuffixTreeNode for all internal nodes.
147   void setLeafNodes();
148 
149 public:
150   /// Construct a suffix tree from a sequence of unsigned integers.
151   ///
152   /// \param Str The string to construct the suffix tree for.
153   /// \param OutlinerLeafDescendants Whether to consider leaf descendants or
154   /// only leaf children (used by Machine Outliner).
155   SuffixTree(const ArrayRef<unsigned> &Str,
156              bool OutlinerLeafDescendants = false);
157 
158   /// Iterator for finding all repeated substrings in the suffix tree.
159   struct RepeatedSubstringIterator {
160   private:
161     /// The current node we're visiting.
162     SuffixTreeNode *N = nullptr;
163 
164     /// The repeated substring associated with this node.
165     RepeatedSubstring RS;
166 
167     /// The nodes left to visit.
168     SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
169 
170     /// The minimum length of a repeated substring to find.
171     /// Since we're outlining, we want at least two instructions in the range.
172     /// FIXME: This may not be true for targets like X86 which support many
173     /// instruction lengths.
174     const unsigned MinLength = 2;
175 
176     /// Vector of leaf nodes of the suffix tree.
177     const std::vector<SuffixTreeLeafNode *> &LeafNodes;
178 
179     /// Whether to consider leaf descendants or only leaf children.
180     bool OutlinerLeafDescendants = !LeafNodes.empty();
181 
182     /// Move the iterator to the next repeated substring.
183     void advance();
184 
185   public:
186     /// Return the current repeated substring.
187     RepeatedSubstring &operator*() { return RS; }
188 
189     RepeatedSubstringIterator &operator++() {
190       advance();
191       return *this;
192     }
193 
194     RepeatedSubstringIterator operator++(int I) {
195       RepeatedSubstringIterator It(*this);
196       advance();
197       return It;
198     }
199 
200     bool operator==(const RepeatedSubstringIterator &Other) const {
201       return N == Other.N;
202     }
203     bool operator!=(const RepeatedSubstringIterator &Other) const {
204       return !(*this == Other);
205     }
206 
207     RepeatedSubstringIterator(
208         SuffixTreeInternalNode *N,
209         const std::vector<SuffixTreeLeafNode *> &LeafNodes = {})
NRepeatedSubstringIterator210         : N(N), LeafNodes(LeafNodes) {
211       // Do we have a non-null node?
212       if (!N)
213         return;
214       // Yes. At the first step, we need to visit all of N's children.
215       // Note: This means that we visit N last.
216       InternalNodesToVisit.push_back(N);
217       advance();
218     }
219   };
220 
221   typedef RepeatedSubstringIterator iterator;
begin()222   iterator begin() { return iterator(Root, LeafNodes); }
end()223   iterator end() { return iterator(nullptr); }
224 };
225 
226 } // namespace llvm
227 
228 #endif // LLVM_SUPPORT_SUFFIXTREE_H
229