1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 #include "llvm/Support/Casting.h"
16 #include "llvm/Support/SuffixTreeNode.h"
17
18 using namespace llvm;
19
20 /// \returns the number of elements in the substring associated with \p N.
numElementsInSubstring(const SuffixTreeNode * N)21 static size_t numElementsInSubstring(const SuffixTreeNode *N) {
22 assert(N && "Got a null node?");
23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24 if (Internal->isRoot())
25 return 0;
26 return N->getEndIdx() - N->getStartIdx() + 1;
27 }
28
SuffixTree(const ArrayRef<unsigned> & Str,bool OutlinerLeafDescendants)29 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str,
30 bool OutlinerLeafDescendants)
31 : Str(Str), OutlinerLeafDescendants(OutlinerLeafDescendants) {
32 Root = insertRoot();
33 Active.Node = Root;
34
35 // Keep track of the number of suffixes we have to add of the current
36 // prefix.
37 unsigned SuffixesToAdd = 0;
38
39 // Construct the suffix tree iteratively on each prefix of the string.
40 // PfxEndIdx is the end index of the current prefix.
41 // End is one past the last element in the string.
42 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
43 SuffixesToAdd++;
44 LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
45 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
46 }
47
48 // Set the suffix indices of each leaf.
49 assert(Root && "Root node can't be nullptr!");
50 setSuffixIndices();
51
52 // Collect all leaf nodes of the suffix tree. And for each internal node,
53 // record the range of leaf nodes that are descendants of it.
54 if (OutlinerLeafDescendants)
55 setLeafNodes();
56 }
57
insertLeaf(SuffixTreeInternalNode & Parent,unsigned StartIdx,unsigned Edge)58 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
59 unsigned StartIdx, unsigned Edge) {
60 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
61 auto *N = new (LeafNodeAllocator.Allocate())
62 SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
63 Parent.Children[Edge] = N;
64 return N;
65 }
66
67 SuffixTreeInternalNode *
insertInternalNode(SuffixTreeInternalNode * Parent,unsigned StartIdx,unsigned EndIdx,unsigned Edge)68 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
69 unsigned StartIdx, unsigned EndIdx,
70 unsigned Edge) {
71 assert(StartIdx <= EndIdx && "String can't start after it ends!");
72 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
73 "Non-root internal nodes must have parents!");
74 auto *N = new (InternalNodeAllocator.Allocate())
75 SuffixTreeInternalNode(StartIdx, EndIdx, Root);
76 if (Parent)
77 Parent->Children[Edge] = N;
78 return N;
79 }
80
insertRoot()81 SuffixTreeInternalNode *SuffixTree::insertRoot() {
82 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
83 SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
84 }
85
setSuffixIndices()86 void SuffixTree::setSuffixIndices() {
87 // List of nodes we need to visit along with the current length of the
88 // string.
89 SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
90
91 // Current node being visited.
92 SuffixTreeNode *CurrNode = Root;
93
94 // Sum of the lengths of the nodes down the path to the current one.
95 unsigned CurrNodeLen = 0;
96 ToVisit.push_back({CurrNode, CurrNodeLen});
97 while (!ToVisit.empty()) {
98 std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
99 ToVisit.pop_back();
100 // Length of the current node from the root down to here.
101 CurrNode->setConcatLen(CurrNodeLen);
102 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
103 for (auto &ChildPair : InternalNode->Children) {
104 assert(ChildPair.second && "Node had a null child!");
105 ToVisit.push_back(
106 {ChildPair.second,
107 CurrNodeLen + numElementsInSubstring(ChildPair.second)});
108 }
109 // No children, so we are at the end of the string.
110 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
111 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
112 }
113 }
114
setLeafNodes()115 void SuffixTree::setLeafNodes() {
116 // A stack that keeps track of nodes to visit for post-order DFS traversal.
117 SmallVector<SuffixTreeNode *> ToVisit;
118 ToVisit.push_back(Root);
119
120 // This keeps track of the index of the next leaf node to be added to
121 // the LeafNodes vector of the suffix tree.
122 unsigned LeafCounter = 0;
123
124 // This keeps track of nodes whose children have been added to the stack.
125 // The value is a pair, representing a node's first and last children.
126 DenseMap<SuffixTreeInternalNode *,
127 std::pair<SuffixTreeNode *, SuffixTreeNode *>>
128 ChildrenMap;
129
130 // Traverse the tree in post-order.
131 while (!ToVisit.empty()) {
132 SuffixTreeNode *CurrNode = ToVisit.pop_back_val();
133 if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) {
134 // The current node is an internal node.
135 auto I = ChildrenMap.find(CurrInternalNode);
136 if (I == ChildrenMap.end()) {
137 // This is the first time we visit this node.
138 // Its children have not been added to the stack yet.
139 // We add current node back, and add its children to the stack.
140 // We keep track of the first and last children of the current node.
141 auto J = CurrInternalNode->Children.begin();
142 if (J != CurrInternalNode->Children.end()) {
143 ToVisit.push_back(CurrNode);
144 SuffixTreeNode *FirstChild = J->second;
145 SuffixTreeNode *LastChild = nullptr;
146 for (; J != CurrInternalNode->Children.end(); ++J) {
147 LastChild = J->second;
148 ToVisit.push_back(LastChild);
149 }
150 ChildrenMap[CurrInternalNode] = {FirstChild, LastChild};
151 }
152 } else {
153 // This is the second time we visit this node.
154 // All of its children have already been processed.
155 // Now, we can set its LeftLeafIdx and RightLeafIdx;
156 auto [FirstChild, LastChild] = I->second;
157 // Get the first child to use its RightLeafIdx.
158 // The first child is the first one added to the stack, so it is
159 // the last one to be processed. Hence, the leaf descendants
160 // of the first child are assigned the largest index numbers.
161 CurrNode->setRightLeafIdx(FirstChild->getRightLeafIdx());
162 // Get the last child to use its LeftLeafIdx.
163 CurrNode->setLeftLeafIdx(LastChild->getLeftLeafIdx());
164 assert(CurrNode->getLeftLeafIdx() <= CurrNode->getRightLeafIdx() &&
165 "LeftLeafIdx should not be larger than RightLeafIdx");
166 }
167 } else {
168 // The current node is a leaf node.
169 // We can simply set its LeftLeafIdx and RightLeafIdx.
170 CurrNode->setLeftLeafIdx(LeafCounter);
171 CurrNode->setRightLeafIdx(LeafCounter);
172 ++LeafCounter;
173 auto *CurrLeafNode = cast<SuffixTreeLeafNode>(CurrNode);
174 LeafNodes.push_back(CurrLeafNode);
175 }
176 }
177 }
178
extend(unsigned EndIdx,unsigned SuffixesToAdd)179 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
180 SuffixTreeInternalNode *NeedsLink = nullptr;
181
182 while (SuffixesToAdd > 0) {
183
184 // Are we waiting to add anything other than just the last character?
185 if (Active.Len == 0) {
186 // If not, then say the active index is the end index.
187 Active.Idx = EndIdx;
188 }
189
190 assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
191
192 // The first character in the current substring we're looking at.
193 unsigned FirstChar = Str[Active.Idx];
194
195 // Have we inserted anything starting with FirstChar at the current node?
196 if (Active.Node->Children.count(FirstChar) == 0) {
197 // If not, then we can just insert a leaf and move to the next step.
198 insertLeaf(*Active.Node, EndIdx, FirstChar);
199
200 // The active node is an internal node, and we visited it, so it must
201 // need a link if it doesn't have one.
202 if (NeedsLink) {
203 NeedsLink->setLink(Active.Node);
204 NeedsLink = nullptr;
205 }
206 } else {
207 // There's a match with FirstChar, so look for the point in the tree to
208 // insert a new node.
209 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
210
211 unsigned SubstringLen = numElementsInSubstring(NextNode);
212
213 // Is the current suffix we're trying to insert longer than the size of
214 // the child we want to move to?
215 if (Active.Len >= SubstringLen) {
216 // If yes, then consume the characters we've seen and move to the next
217 // node.
218 assert(isa<SuffixTreeInternalNode>(NextNode) &&
219 "Expected an internal node?");
220 Active.Idx += SubstringLen;
221 Active.Len -= SubstringLen;
222 Active.Node = cast<SuffixTreeInternalNode>(NextNode);
223 continue;
224 }
225
226 // Otherwise, the suffix we're trying to insert must be contained in the
227 // next node we want to move to.
228 unsigned LastChar = Str[EndIdx];
229
230 // Is the string we're trying to insert a substring of the next node?
231 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
232 // If yes, then we're done for this step. Remember our insertion point
233 // and move to the next end index. At this point, we have an implicit
234 // suffix tree.
235 if (NeedsLink && !Active.Node->isRoot()) {
236 NeedsLink->setLink(Active.Node);
237 NeedsLink = nullptr;
238 }
239
240 Active.Len++;
241 break;
242 }
243
244 // The string we're trying to insert isn't a substring of the next node,
245 // but matches up to a point. Split the node.
246 //
247 // For example, say we ended our search at a node n and we're trying to
248 // insert ABD. Then we'll create a new node s for AB, reduce n to just
249 // representing C, and insert a new leaf node l to represent d. This
250 // allows us to ensure that if n was a leaf, it remains a leaf.
251 //
252 // | ABC ---split---> | AB
253 // n s
254 // C / \ D
255 // n l
256
257 // The node s from the diagram
258 SuffixTreeInternalNode *SplitNode = insertInternalNode(
259 Active.Node, NextNode->getStartIdx(),
260 NextNode->getStartIdx() + Active.Len - 1, FirstChar);
261
262 // Insert the new node representing the new substring into the tree as
263 // a child of the split node. This is the node l from the diagram.
264 insertLeaf(*SplitNode, EndIdx, LastChar);
265
266 // Make the old node a child of the split node and update its start
267 // index. This is the node n from the diagram.
268 NextNode->incrementStartIdx(Active.Len);
269 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
270
271 // SplitNode is an internal node, update the suffix link.
272 if (NeedsLink)
273 NeedsLink->setLink(SplitNode);
274
275 NeedsLink = SplitNode;
276 }
277
278 // We've added something new to the tree, so there's one less suffix to
279 // add.
280 SuffixesToAdd--;
281
282 if (Active.Node->isRoot()) {
283 if (Active.Len > 0) {
284 Active.Len--;
285 Active.Idx = EndIdx - SuffixesToAdd + 1;
286 }
287 } else {
288 // Start the next phase at the next smallest suffix.
289 Active.Node = Active.Node->getLink();
290 }
291 }
292
293 return SuffixesToAdd;
294 }
295
advance()296 void SuffixTree::RepeatedSubstringIterator::advance() {
297 // Clear the current state. If we're at the end of the range, then this
298 // is the state we want to be in.
299 RS = RepeatedSubstring();
300 N = nullptr;
301
302 // Each leaf node represents a repeat of a string.
303 SmallVector<unsigned> RepeatedSubstringStarts;
304
305 // Continue visiting nodes until we find one which repeats more than once.
306 while (!InternalNodesToVisit.empty()) {
307 RepeatedSubstringStarts.clear();
308 auto *Curr = InternalNodesToVisit.back();
309 InternalNodesToVisit.pop_back();
310
311 // Keep track of the length of the string associated with the node. If
312 // it's too short, we'll quit.
313 unsigned Length = Curr->getConcatLen();
314
315 // Iterate over each child, saving internal nodes for visiting.
316 // Internal nodes represent individual strings, which may repeat.
317 for (auto &ChildPair : Curr->Children)
318 // Save all of this node's children for processing.
319 if (auto *InternalChild =
320 dyn_cast<SuffixTreeInternalNode>(ChildPair.second))
321 InternalNodesToVisit.push_back(InternalChild);
322
323 // If length of repeated substring is below threshold, then skip it.
324 if (Length < MinLength)
325 continue;
326
327 // The root never represents a repeated substring. If we're looking at
328 // that, then skip it.
329 if (Curr->isRoot())
330 continue;
331
332 // Collect leaf children or leaf descendants by OutlinerLeafDescendants.
333 if (OutlinerLeafDescendants) {
334 for (unsigned I = Curr->getLeftLeafIdx(); I <= Curr->getRightLeafIdx();
335 ++I)
336 RepeatedSubstringStarts.push_back(LeafNodes[I]->getSuffixIdx());
337 } else {
338 for (auto &ChildPair : Curr->Children)
339 if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(ChildPair.second))
340 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
341 }
342
343 // Do we have any repeated substrings?
344 if (RepeatedSubstringStarts.size() < 2)
345 continue;
346
347 // Yes. Update the state to reflect this, and then bail out.
348 N = Curr;
349 RS.Length = Length;
350 for (unsigned StartIdx : RepeatedSubstringStarts)
351 RS.StartIndices.push_back(StartIdx);
352 break;
353 }
354 // At this point, either NewRS is an empty RepeatedSubstring, or it was
355 // set in the above loop. Similarly, N is either nullptr, or the node
356 // associated with NewRS.
357 }
358