xref: /freebsd/contrib/llvm-project/llvm/lib/Support/SuffixTree.cpp (revision 6be3386466ab79a84b48429ae66244f21526d3df)
1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 #include <vector>
16 
17 using namespace llvm;
18 
19 SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
20   Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21   Active.Node = Root;
22 
23   // Keep track of the number of suffixes we have to add of the current
24   // prefix.
25   unsigned SuffixesToAdd = 0;
26 
27   // Construct the suffix tree iteratively on each prefix of the string.
28   // PfxEndIdx is the end index of the current prefix.
29   // End is one past the last element in the string.
30   for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31     SuffixesToAdd++;
32     LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33     SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34   }
35 
36   // Set the suffix indices of each leaf.
37   assert(Root && "Root node can't be nullptr!");
38   setSuffixIndices();
39 }
40 
41 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42                                        unsigned StartIdx, unsigned Edge) {
43 
44   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45 
46   SuffixTreeNode *N = new (NodeAllocator.Allocate())
47       SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48   Parent.Children[Edge] = N;
49 
50   return N;
51 }
52 
53 SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54                                                unsigned StartIdx,
55                                                unsigned EndIdx, unsigned Edge) {
56 
57   assert(StartIdx <= EndIdx && "String can't start after it ends!");
58   assert(!(!Parent && StartIdx != EmptyIdx) &&
59          "Non-root internal nodes must have parents!");
60 
61   unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
62   SuffixTreeNode *N =
63       new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64   if (Parent)
65     Parent->Children[Edge] = N;
66 
67   return N;
68 }
69 
70 void SuffixTree::setSuffixIndices() {
71   // List of nodes we need to visit along with the current length of the
72   // string.
73   std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
74 
75   // Current node being visited.
76   SuffixTreeNode *CurrNode = Root;
77 
78   // Sum of the lengths of the nodes down the path to the current one.
79   unsigned CurrNodeLen = 0;
80   ToVisit.push_back({CurrNode, CurrNodeLen});
81   while (!ToVisit.empty()) {
82     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83     ToVisit.pop_back();
84     CurrNode->ConcatLen = CurrNodeLen;
85     for (auto &ChildPair : CurrNode->Children) {
86       assert(ChildPair.second && "Node had a null child!");
87       ToVisit.push_back(
88           {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89     }
90 
91     // No children, so we are at the end of the string.
92     if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93       CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94   }
95 }
96 
97 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98   SuffixTreeNode *NeedsLink = nullptr;
99 
100   while (SuffixesToAdd > 0) {
101 
102     // Are we waiting to add anything other than just the last character?
103     if (Active.Len == 0) {
104       // If not, then say the active index is the end index.
105       Active.Idx = EndIdx;
106     }
107 
108     assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109 
110     // The first character in the current substring we're looking at.
111     unsigned FirstChar = Str[Active.Idx];
112 
113     // Have we inserted anything starting with FirstChar at the current node?
114     if (Active.Node->Children.count(FirstChar) == 0) {
115       // If not, then we can just insert a leaf and move to the next step.
116       insertLeaf(*Active.Node, EndIdx, FirstChar);
117 
118       // The active node is an internal node, and we visited it, so it must
119       // need a link if it doesn't have one.
120       if (NeedsLink) {
121         NeedsLink->Link = Active.Node;
122         NeedsLink = nullptr;
123       }
124     } else {
125       // There's a match with FirstChar, so look for the point in the tree to
126       // insert a new node.
127       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128 
129       unsigned SubstringLen = NextNode->size();
130 
131       // Is the current suffix we're trying to insert longer than the size of
132       // the child we want to move to?
133       if (Active.Len >= SubstringLen) {
134         // If yes, then consume the characters we've seen and move to the next
135         // node.
136         Active.Idx += SubstringLen;
137         Active.Len -= SubstringLen;
138         Active.Node = NextNode;
139         continue;
140       }
141 
142       // Otherwise, the suffix we're trying to insert must be contained in the
143       // next node we want to move to.
144       unsigned LastChar = Str[EndIdx];
145 
146       // Is the string we're trying to insert a substring of the next node?
147       if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148         // If yes, then we're done for this step. Remember our insertion point
149         // and move to the next end index. At this point, we have an implicit
150         // suffix tree.
151         if (NeedsLink && !Active.Node->isRoot()) {
152           NeedsLink->Link = Active.Node;
153           NeedsLink = nullptr;
154         }
155 
156         Active.Len++;
157         break;
158       }
159 
160       // The string we're trying to insert isn't a substring of the next node,
161       // but matches up to a point. Split the node.
162       //
163       // For example, say we ended our search at a node n and we're trying to
164       // insert ABD. Then we'll create a new node s for AB, reduce n to just
165       // representing C, and insert a new leaf node l to represent d. This
166       // allows us to ensure that if n was a leaf, it remains a leaf.
167       //
168       //   | ABC  ---split--->  | AB
169       //   n                    s
170       //                     C / \ D
171       //                      n   l
172 
173       // The node s from the diagram
174       SuffixTreeNode *SplitNode =
175           insertInternalNode(Active.Node, NextNode->StartIdx,
176                              NextNode->StartIdx + Active.Len - 1, FirstChar);
177 
178       // Insert the new node representing the new substring into the tree as
179       // a child of the split node. This is the node l from the diagram.
180       insertLeaf(*SplitNode, EndIdx, LastChar);
181 
182       // Make the old node a child of the split node and update its start
183       // index. This is the node n from the diagram.
184       NextNode->StartIdx += Active.Len;
185       SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186 
187       // SplitNode is an internal node, update the suffix link.
188       if (NeedsLink)
189         NeedsLink->Link = SplitNode;
190 
191       NeedsLink = SplitNode;
192     }
193 
194     // We've added something new to the tree, so there's one less suffix to
195     // add.
196     SuffixesToAdd--;
197 
198     if (Active.Node->isRoot()) {
199       if (Active.Len > 0) {
200         Active.Len--;
201         Active.Idx = EndIdx - SuffixesToAdd + 1;
202       }
203     } else {
204       // Start the next phase at the next smallest suffix.
205       Active.Node = Active.Node->Link;
206     }
207   }
208 
209   return SuffixesToAdd;
210 }
211