xref: /freebsd/contrib/llvm-project/llvm/lib/Support/SuffixTree.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 #include "llvm/Support/Casting.h"
16 #include "llvm/Support/SuffixTreeNode.h"
17 
18 using namespace llvm;
19 
20 /// \returns the number of elements in the substring associated with \p N.
21 static size_t numElementsInSubstring(const SuffixTreeNode *N) {
22   assert(N && "Got a null node?");
23   if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24     if (Internal->isRoot())
25       return 0;
26   return N->getEndIdx() - N->getStartIdx() + 1;
27 }
28 
29 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str,
30                        bool OutlinerLeafDescendants)
31     : Str(Str), OutlinerLeafDescendants(OutlinerLeafDescendants) {
32   Root = insertRoot();
33   Active.Node = Root;
34 
35   // Keep track of the number of suffixes we have to add of the current
36   // prefix.
37   unsigned SuffixesToAdd = 0;
38 
39   // Construct the suffix tree iteratively on each prefix of the string.
40   // PfxEndIdx is the end index of the current prefix.
41   // End is one past the last element in the string.
42   for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
43     SuffixesToAdd++;
44     LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
45     SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
46   }
47 
48   // Set the suffix indices of each leaf.
49   assert(Root && "Root node can't be nullptr!");
50   setSuffixIndices();
51 
52   // Collect all leaf nodes of the suffix tree. And for each internal node,
53   // record the range of leaf nodes that are descendants of it.
54   if (OutlinerLeafDescendants)
55     setLeafNodes();
56 }
57 
58 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
59                                        unsigned StartIdx, unsigned Edge) {
60   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
61   auto *N = new (LeafNodeAllocator.Allocate())
62       SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
63   Parent.Children[Edge] = N;
64   return N;
65 }
66 
67 SuffixTreeInternalNode *
68 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
69                                unsigned StartIdx, unsigned EndIdx,
70                                unsigned Edge) {
71   assert(StartIdx <= EndIdx && "String can't start after it ends!");
72   assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
73          "Non-root internal nodes must have parents!");
74   auto *N = new (InternalNodeAllocator.Allocate())
75       SuffixTreeInternalNode(StartIdx, EndIdx, Root);
76   if (Parent)
77     Parent->Children[Edge] = N;
78   return N;
79 }
80 
81 SuffixTreeInternalNode *SuffixTree::insertRoot() {
82   return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
83                             SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
84 }
85 
86 void SuffixTree::setSuffixIndices() {
87   // List of nodes we need to visit along with the current length of the
88   // string.
89   SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
90 
91   // Current node being visited.
92   SuffixTreeNode *CurrNode = Root;
93 
94   // Sum of the lengths of the nodes down the path to the current one.
95   unsigned CurrNodeLen = 0;
96   ToVisit.push_back({CurrNode, CurrNodeLen});
97   while (!ToVisit.empty()) {
98     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
99     ToVisit.pop_back();
100     // Length of the current node from the root down to here.
101     CurrNode->setConcatLen(CurrNodeLen);
102     if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
103       for (auto &ChildPair : InternalNode->Children) {
104         assert(ChildPair.second && "Node had a null child!");
105         ToVisit.push_back(
106             {ChildPair.second,
107              CurrNodeLen + numElementsInSubstring(ChildPair.second)});
108       }
109     // No children, so we are at the end of the string.
110     if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
111       LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
112   }
113 }
114 
115 void SuffixTree::setLeafNodes() {
116   // A stack that keeps track of nodes to visit for post-order DFS traversal.
117   SmallVector<SuffixTreeNode *> ToVisit;
118   ToVisit.push_back(Root);
119 
120   // This keeps track of the index of the next leaf node to be added to
121   // the LeafNodes vector of the suffix tree.
122   unsigned LeafCounter = 0;
123 
124   // This keeps track of nodes whose children have been added to the stack.
125   // The value is a pair, representing a node's first and last children.
126   DenseMap<SuffixTreeInternalNode *,
127            std::pair<SuffixTreeNode *, SuffixTreeNode *>>
128       ChildrenMap;
129 
130   // Traverse the tree in post-order.
131   while (!ToVisit.empty()) {
132     SuffixTreeNode *CurrNode = ToVisit.pop_back_val();
133     if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) {
134       // The current node is an internal node.
135       auto I = ChildrenMap.find(CurrInternalNode);
136       if (I == ChildrenMap.end()) {
137         // This is the first time we visit this node.
138         // Its children have not been added to the stack yet.
139         // We add current node back, and add its children to the stack.
140         // We keep track of the first and last children of the current node.
141         auto J = CurrInternalNode->Children.begin();
142         if (J != CurrInternalNode->Children.end()) {
143           ToVisit.push_back(CurrNode);
144           SuffixTreeNode *FirstChild = J->second;
145           SuffixTreeNode *LastChild = nullptr;
146           for (; J != CurrInternalNode->Children.end(); ++J) {
147             LastChild = J->second;
148             ToVisit.push_back(LastChild);
149           }
150           ChildrenMap[CurrInternalNode] = {FirstChild, LastChild};
151         }
152       } else {
153         // This is the second time we visit this node.
154         // All of its children have already been processed.
155         // Now, we can set its LeftLeafIdx and RightLeafIdx;
156         auto [FirstChild, LastChild] = I->second;
157         // Get the first child to use its RightLeafIdx.
158         // The first child is the first one added to the stack, so it is
159         // the last one to be processed. Hence, the leaf descendants
160         // of the first child are assigned the largest index numbers.
161         CurrNode->setRightLeafIdx(FirstChild->getRightLeafIdx());
162         // Get the last child to use its LeftLeafIdx.
163         CurrNode->setLeftLeafIdx(LastChild->getLeftLeafIdx());
164         assert(CurrNode->getLeftLeafIdx() <= CurrNode->getRightLeafIdx() &&
165                "LeftLeafIdx should not be larger than RightLeafIdx");
166       }
167     } else {
168       // The current node is a leaf node.
169       // We can simply set its LeftLeafIdx and RightLeafIdx.
170       CurrNode->setLeftLeafIdx(LeafCounter);
171       CurrNode->setRightLeafIdx(LeafCounter);
172       ++LeafCounter;
173       auto *CurrLeafNode = cast<SuffixTreeLeafNode>(CurrNode);
174       LeafNodes.push_back(CurrLeafNode);
175     }
176   }
177 }
178 
179 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
180   SuffixTreeInternalNode *NeedsLink = nullptr;
181 
182   while (SuffixesToAdd > 0) {
183 
184     // Are we waiting to add anything other than just the last character?
185     if (Active.Len == 0) {
186       // If not, then say the active index is the end index.
187       Active.Idx = EndIdx;
188     }
189 
190     assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
191 
192     // The first character in the current substring we're looking at.
193     unsigned FirstChar = Str[Active.Idx];
194 
195     // Have we inserted anything starting with FirstChar at the current node?
196     if (Active.Node->Children.count(FirstChar) == 0) {
197       // If not, then we can just insert a leaf and move to the next step.
198       insertLeaf(*Active.Node, EndIdx, FirstChar);
199 
200       // The active node is an internal node, and we visited it, so it must
201       // need a link if it doesn't have one.
202       if (NeedsLink) {
203         NeedsLink->setLink(Active.Node);
204         NeedsLink = nullptr;
205       }
206     } else {
207       // There's a match with FirstChar, so look for the point in the tree to
208       // insert a new node.
209       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
210 
211       unsigned SubstringLen = numElementsInSubstring(NextNode);
212 
213       // Is the current suffix we're trying to insert longer than the size of
214       // the child we want to move to?
215       if (Active.Len >= SubstringLen) {
216         // If yes, then consume the characters we've seen and move to the next
217         // node.
218         assert(isa<SuffixTreeInternalNode>(NextNode) &&
219                "Expected an internal node?");
220         Active.Idx += SubstringLen;
221         Active.Len -= SubstringLen;
222         Active.Node = cast<SuffixTreeInternalNode>(NextNode);
223         continue;
224       }
225 
226       // Otherwise, the suffix we're trying to insert must be contained in the
227       // next node we want to move to.
228       unsigned LastChar = Str[EndIdx];
229 
230       // Is the string we're trying to insert a substring of the next node?
231       if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
232         // If yes, then we're done for this step. Remember our insertion point
233         // and move to the next end index. At this point, we have an implicit
234         // suffix tree.
235         if (NeedsLink && !Active.Node->isRoot()) {
236           NeedsLink->setLink(Active.Node);
237           NeedsLink = nullptr;
238         }
239 
240         Active.Len++;
241         break;
242       }
243 
244       // The string we're trying to insert isn't a substring of the next node,
245       // but matches up to a point. Split the node.
246       //
247       // For example, say we ended our search at a node n and we're trying to
248       // insert ABD. Then we'll create a new node s for AB, reduce n to just
249       // representing C, and insert a new leaf node l to represent d. This
250       // allows us to ensure that if n was a leaf, it remains a leaf.
251       //
252       //   | ABC  ---split--->  | AB
253       //   n                    s
254       //                     C / \ D
255       //                      n   l
256 
257       // The node s from the diagram
258       SuffixTreeInternalNode *SplitNode = insertInternalNode(
259           Active.Node, NextNode->getStartIdx(),
260           NextNode->getStartIdx() + Active.Len - 1, FirstChar);
261 
262       // Insert the new node representing the new substring into the tree as
263       // a child of the split node. This is the node l from the diagram.
264       insertLeaf(*SplitNode, EndIdx, LastChar);
265 
266       // Make the old node a child of the split node and update its start
267       // index. This is the node n from the diagram.
268       NextNode->incrementStartIdx(Active.Len);
269       SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
270 
271       // SplitNode is an internal node, update the suffix link.
272       if (NeedsLink)
273         NeedsLink->setLink(SplitNode);
274 
275       NeedsLink = SplitNode;
276     }
277 
278     // We've added something new to the tree, so there's one less suffix to
279     // add.
280     SuffixesToAdd--;
281 
282     if (Active.Node->isRoot()) {
283       if (Active.Len > 0) {
284         Active.Len--;
285         Active.Idx = EndIdx - SuffixesToAdd + 1;
286       }
287     } else {
288       // Start the next phase at the next smallest suffix.
289       Active.Node = Active.Node->getLink();
290     }
291   }
292 
293   return SuffixesToAdd;
294 }
295 
296 void SuffixTree::RepeatedSubstringIterator::advance() {
297   // Clear the current state. If we're at the end of the range, then this
298   // is the state we want to be in.
299   RS = RepeatedSubstring();
300   N = nullptr;
301 
302   // Each leaf node represents a repeat of a string.
303   SmallVector<unsigned> RepeatedSubstringStarts;
304 
305   // Continue visiting nodes until we find one which repeats more than once.
306   while (!InternalNodesToVisit.empty()) {
307     RepeatedSubstringStarts.clear();
308     auto *Curr = InternalNodesToVisit.back();
309     InternalNodesToVisit.pop_back();
310 
311     // Keep track of the length of the string associated with the node. If
312     // it's too short, we'll quit.
313     unsigned Length = Curr->getConcatLen();
314 
315     // Iterate over each child, saving internal nodes for visiting.
316     // Internal nodes represent individual strings, which may repeat.
317     for (auto &ChildPair : Curr->Children)
318       // Save all of this node's children for processing.
319       if (auto *InternalChild =
320               dyn_cast<SuffixTreeInternalNode>(ChildPair.second))
321         InternalNodesToVisit.push_back(InternalChild);
322 
323     // If length of repeated substring is below threshold, then skip it.
324     if (Length < MinLength)
325       continue;
326 
327     // The root never represents a repeated substring. If we're looking at
328     // that, then skip it.
329     if (Curr->isRoot())
330       continue;
331 
332     // Collect leaf children or leaf descendants by OutlinerLeafDescendants.
333     if (OutlinerLeafDescendants) {
334       for (unsigned I = Curr->getLeftLeafIdx(); I <= Curr->getRightLeafIdx();
335            ++I)
336         RepeatedSubstringStarts.push_back(LeafNodes[I]->getSuffixIdx());
337     } else {
338       for (auto &ChildPair : Curr->Children)
339         if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(ChildPair.second))
340           RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
341     }
342 
343     // Do we have any repeated substrings?
344     if (RepeatedSubstringStarts.size() < 2)
345       continue;
346 
347     // Yes. Update the state to reflect this, and then bail out.
348     N = Curr;
349     RS.Length = Length;
350     for (unsigned StartIdx : RepeatedSubstringStarts)
351       RS.StartIndices.push_back(StartIdx);
352     break;
353   }
354   // At this point, either NewRS is an empty RepeatedSubstring, or it was
355   // set in the above loop. Similarly, N is either nullptr, or the node
356   // associated with NewRS.
357 }
358