xref: /freebsd/contrib/llvm-project/llvm/include/llvm/Support/SuffixTree.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // A data structure for fast substring queries.
9 //
10 // Suffix trees represent the suffixes of their input strings in their leaves.
11 // A suffix tree is a type of compressed trie structure where each node
12 // represents an entire substring rather than a single character. Each leaf
13 // of the tree is a suffix.
14 //
15 // A suffix tree can be seen as a type of state machine where each state is a
16 // substring of the full string. The tree is structured so that, for a string
17 // of length N, there are exactly N leaves in the tree. This structure allows
18 // us to quickly find repeated substrings of the input string.
19 //
20 // In this implementation, a "string" is a vector of unsigned integers.
21 // These integers may result from hashing some data type. A suffix tree can
22 // contain 1 or many strings, which can then be queried as one large string.
23 //
24 // The suffix tree is implemented using Ukkonen's algorithm for linear-time
25 // suffix tree construction. Ukkonen's algorithm is explained in more detail
26 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27 // paper is available at
28 //
29 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30 //===----------------------------------------------------------------------===//
31 
32 #ifndef LLVM_SUPPORT_SUFFIXTREE_H
33 #define LLVM_SUPPORT_SUFFIXTREE_H
34 
35 #include "llvm/ADT/ArrayRef.h"
36 #include "llvm/Support/Allocator.h"
37 #include "llvm/Support/Compiler.h"
38 #include "llvm/Support/SuffixTreeNode.h"
39 
40 namespace llvm {
41 class SuffixTree {
42 public:
43   /// Each element is an integer representing an instruction in the module.
44   ArrayRef<unsigned> Str;
45 
46   /// Whether to consider leaf descendants or only leaf children.
47   bool OutlinerLeafDescendants;
48 
49   /// A repeated substring in the tree.
50   struct RepeatedSubstring {
51     /// The length of the string.
52     unsigned Length;
53 
54     /// The start indices of each occurrence.
55     SmallVector<unsigned> StartIndices;
56   };
57 
58 private:
59   /// Maintains internal nodes in the tree.
60   SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
61   /// Maintains leaf nodes in the tree.
62   SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
63 
64   /// The root of the suffix tree.
65   ///
66   /// The root represents the empty string. It is maintained by the
67   /// \p NodeAllocator like every other node in the tree.
68   SuffixTreeInternalNode *Root = nullptr;
69 
70   /// The end index of each leaf in the tree.
71   unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
72 
73   /// Helper struct which keeps track of the next insertion point in
74   /// Ukkonen's algorithm.
75   struct ActiveState {
76     /// The next node to insert at.
77     SuffixTreeInternalNode *Node = nullptr;
78 
79     /// The index of the first character in the substring currently being added.
80     unsigned Idx = SuffixTreeNode::EmptyIdx;
81 
82     /// The length of the substring we have to add at the current step.
83     unsigned Len = 0;
84   };
85 
86   /// The point the next insertion will take place at in the
87   /// construction algorithm.
88   ActiveState Active;
89 
90   /// Allocate a leaf node and add it to the tree.
91   ///
92   /// \param Parent The parent of this node.
93   /// \param StartIdx The start index of this node's associated string.
94   /// \param Edge The label on the edge leaving \p Parent to this node.
95   ///
96   /// \returns A pointer to the allocated leaf node.
97   SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
98                              unsigned Edge);
99 
100   /// Allocate an internal node and add it to the tree.
101   ///
102   /// \param Parent The parent of this node. Only null when allocating the root.
103   /// \param StartIdx The start index of this node's associated string.
104   /// \param EndIdx The end index of this node's associated string.
105   /// \param Edge The label on the edge leaving \p Parent to this node.
106   ///
107   /// \returns A pointer to the allocated internal node.
108   SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
109                                              unsigned StartIdx, unsigned EndIdx,
110                                              unsigned Edge);
111 
112   /// Allocate the root node and add it to the tree.
113   ///
114   /// \returns A pointer to the root.
115   SuffixTreeInternalNode *insertRoot();
116 
117   /// Set the suffix indices of the leaves to the start indices of their
118   /// respective suffixes.
119   void setSuffixIndices();
120 
121   /// Construct the suffix tree for the prefix of the input ending at
122   /// \p EndIdx.
123   ///
124   /// Used to construct the full suffix tree iteratively. At the end of each
125   /// step, the constructed suffix tree is either a valid suffix tree, or a
126   /// suffix tree with implicit suffixes. At the end of the final step, the
127   /// suffix tree is a valid tree.
128   ///
129   /// \param EndIdx The end index of the current prefix in the main string.
130   /// \param SuffixesToAdd The number of suffixes that must be added
131   /// to complete the suffix tree at the current phase.
132   ///
133   /// \returns The number of suffixes that have not been added at the end of
134   /// this step.
135   unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
136 
137   /// This vector contains all leaf nodes of this suffix tree. These leaf nodes
138   /// are identified using post-order depth-first traversal, so that the order
139   /// of these leaf nodes in the vector matches the order of the leaves in the
140   /// tree from left to right if one were to draw the tree on paper.
141   std::vector<SuffixTreeLeafNode *> LeafNodes;
142 
143   /// Perform a post-order depth-first traversal of the tree and perform two
144   /// tasks during the traversal. The first is to populate LeafNodes, adding
145   /// nodes in order of the traversal. The second is to keep track of the leaf
146   /// descendants of every internal node by assigning values to LeftLeafIndex
147   /// and RightLefIndex fields of SuffixTreeNode for all internal nodes.
148   void setLeafNodes();
149 
150 public:
151   /// Construct a suffix tree from a sequence of unsigned integers.
152   ///
153   /// \param Str The string to construct the suffix tree for.
154   /// \param OutlinerLeafDescendants Whether to consider leaf descendants or
155   /// only leaf children (used by Machine Outliner).
156   LLVM_ABI SuffixTree(const ArrayRef<unsigned> &Str,
157                       bool OutlinerLeafDescendants = false);
158 
159   /// Iterator for finding all repeated substrings in the suffix tree.
160   struct RepeatedSubstringIterator {
161   private:
162     /// The current node we're visiting.
163     SuffixTreeNode *N = nullptr;
164 
165     /// The repeated substring associated with this node.
166     RepeatedSubstring RS;
167 
168     /// The nodes left to visit.
169     SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
170 
171     /// The minimum length of a repeated substring to find.
172     /// Since we're outlining, we want at least two instructions in the range.
173     /// FIXME: This may not be true for targets like X86 which support many
174     /// instruction lengths.
175     const unsigned MinLength = 2;
176 
177     /// Vector of leaf nodes of the suffix tree.
178     const std::vector<SuffixTreeLeafNode *> &LeafNodes;
179 
180     /// Whether to consider leaf descendants or only leaf children.
181     bool OutlinerLeafDescendants = !LeafNodes.empty();
182 
183     /// Move the iterator to the next repeated substring.
184     LLVM_ABI void advance();
185 
186   public:
187     /// Return the current repeated substring.
188     RepeatedSubstring &operator*() { return RS; }
189 
190     RepeatedSubstringIterator &operator++() {
191       advance();
192       return *this;
193     }
194 
195     RepeatedSubstringIterator operator++(int I) {
196       RepeatedSubstringIterator It(*this);
197       advance();
198       return It;
199     }
200 
201     bool operator==(const RepeatedSubstringIterator &Other) const {
202       return N == Other.N;
203     }
204     bool operator!=(const RepeatedSubstringIterator &Other) const {
205       return !(*this == Other);
206     }
207 
208     RepeatedSubstringIterator(
209         SuffixTreeInternalNode *N,
210         const std::vector<SuffixTreeLeafNode *> &LeafNodes = {})
NRepeatedSubstringIterator211         : N(N), LeafNodes(LeafNodes) {
212       // Do we have a non-null node?
213       if (!N)
214         return;
215       // Yes. At the first step, we need to visit all of N's children.
216       // Note: This means that we visit N last.
217       InternalNodesToVisit.push_back(N);
218       advance();
219     }
220   };
221 
222   typedef RepeatedSubstringIterator iterator;
begin()223   iterator begin() { return iterator(Root, LeafNodes); }
end()224   iterator end() { return iterator(nullptr); }
225 };
226 
227 } // namespace llvm
228 
229 #endif // LLVM_SUPPORT_SUFFIXTREE_H
230