xref: /freebsd/contrib/llvm-project/llvm/lib/Support/SuffixTree.cpp (revision 419822b372f543b22d7fb04eae0dffacf058feb6)
1  //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // This file implements the Suffix Tree class.
10  //
11  //===----------------------------------------------------------------------===//
12  
13  #include "llvm/Support/SuffixTree.h"
14  #include "llvm/Support/Allocator.h"
15  #include <vector>
16  
17  using namespace llvm;
18  
19  SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
20    Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21    Active.Node = Root;
22  
23    // Keep track of the number of suffixes we have to add of the current
24    // prefix.
25    unsigned SuffixesToAdd = 0;
26  
27    // Construct the suffix tree iteratively on each prefix of the string.
28    // PfxEndIdx is the end index of the current prefix.
29    // End is one past the last element in the string.
30    for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31      SuffixesToAdd++;
32      LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33      SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34    }
35  
36    // Set the suffix indices of each leaf.
37    assert(Root && "Root node can't be nullptr!");
38    setSuffixIndices();
39  }
40  
41  SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42                                         unsigned StartIdx, unsigned Edge) {
43  
44    assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45  
46    SuffixTreeNode *N = new (NodeAllocator.Allocate())
47        SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48    Parent.Children[Edge] = N;
49  
50    return N;
51  }
52  
53  SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54                                                 unsigned StartIdx,
55                                                 unsigned EndIdx, unsigned Edge) {
56  
57    assert(StartIdx <= EndIdx && "String can't start after it ends!");
58    assert(!(!Parent && StartIdx != EmptyIdx) &&
59           "Non-root internal nodes must have parents!");
60  
61    unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
62    SuffixTreeNode *N =
63        new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64    if (Parent)
65      Parent->Children[Edge] = N;
66  
67    return N;
68  }
69  
70  void SuffixTree::setSuffixIndices() {
71    // List of nodes we need to visit along with the current length of the
72    // string.
73    std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
74  
75    // Current node being visited.
76    SuffixTreeNode *CurrNode = Root;
77  
78    // Sum of the lengths of the nodes down the path to the current one.
79    unsigned CurrNodeLen = 0;
80    ToVisit.push_back({CurrNode, CurrNodeLen});
81    while (!ToVisit.empty()) {
82      std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83      ToVisit.pop_back();
84      CurrNode->ConcatLen = CurrNodeLen;
85      for (auto &ChildPair : CurrNode->Children) {
86        assert(ChildPair.second && "Node had a null child!");
87        ToVisit.push_back(
88            {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89      }
90  
91      // No children, so we are at the end of the string.
92      if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93        CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94    }
95  }
96  
97  unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98    SuffixTreeNode *NeedsLink = nullptr;
99  
100    while (SuffixesToAdd > 0) {
101  
102      // Are we waiting to add anything other than just the last character?
103      if (Active.Len == 0) {
104        // If not, then say the active index is the end index.
105        Active.Idx = EndIdx;
106      }
107  
108      assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109  
110      // The first character in the current substring we're looking at.
111      unsigned FirstChar = Str[Active.Idx];
112  
113      // Have we inserted anything starting with FirstChar at the current node?
114      if (Active.Node->Children.count(FirstChar) == 0) {
115        // If not, then we can just insert a leaf and move to the next step.
116        insertLeaf(*Active.Node, EndIdx, FirstChar);
117  
118        // The active node is an internal node, and we visited it, so it must
119        // need a link if it doesn't have one.
120        if (NeedsLink) {
121          NeedsLink->Link = Active.Node;
122          NeedsLink = nullptr;
123        }
124      } else {
125        // There's a match with FirstChar, so look for the point in the tree to
126        // insert a new node.
127        SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128  
129        unsigned SubstringLen = NextNode->size();
130  
131        // Is the current suffix we're trying to insert longer than the size of
132        // the child we want to move to?
133        if (Active.Len >= SubstringLen) {
134          // If yes, then consume the characters we've seen and move to the next
135          // node.
136          Active.Idx += SubstringLen;
137          Active.Len -= SubstringLen;
138          Active.Node = NextNode;
139          continue;
140        }
141  
142        // Otherwise, the suffix we're trying to insert must be contained in the
143        // next node we want to move to.
144        unsigned LastChar = Str[EndIdx];
145  
146        // Is the string we're trying to insert a substring of the next node?
147        if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148          // If yes, then we're done for this step. Remember our insertion point
149          // and move to the next end index. At this point, we have an implicit
150          // suffix tree.
151          if (NeedsLink && !Active.Node->isRoot()) {
152            NeedsLink->Link = Active.Node;
153            NeedsLink = nullptr;
154          }
155  
156          Active.Len++;
157          break;
158        }
159  
160        // The string we're trying to insert isn't a substring of the next node,
161        // but matches up to a point. Split the node.
162        //
163        // For example, say we ended our search at a node n and we're trying to
164        // insert ABD. Then we'll create a new node s for AB, reduce n to just
165        // representing C, and insert a new leaf node l to represent d. This
166        // allows us to ensure that if n was a leaf, it remains a leaf.
167        //
168        //   | ABC  ---split--->  | AB
169        //   n                    s
170        //                     C / \ D
171        //                      n   l
172  
173        // The node s from the diagram
174        SuffixTreeNode *SplitNode =
175            insertInternalNode(Active.Node, NextNode->StartIdx,
176                               NextNode->StartIdx + Active.Len - 1, FirstChar);
177  
178        // Insert the new node representing the new substring into the tree as
179        // a child of the split node. This is the node l from the diagram.
180        insertLeaf(*SplitNode, EndIdx, LastChar);
181  
182        // Make the old node a child of the split node and update its start
183        // index. This is the node n from the diagram.
184        NextNode->StartIdx += Active.Len;
185        SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186  
187        // SplitNode is an internal node, update the suffix link.
188        if (NeedsLink)
189          NeedsLink->Link = SplitNode;
190  
191        NeedsLink = SplitNode;
192      }
193  
194      // We've added something new to the tree, so there's one less suffix to
195      // add.
196      SuffixesToAdd--;
197  
198      if (Active.Node->isRoot()) {
199        if (Active.Len > 0) {
200          Active.Len--;
201          Active.Idx = EndIdx - SuffixesToAdd + 1;
202        }
203      } else {
204        // Start the next phase at the next smallest suffix.
205        Active.Node = Active.Node->Link;
206      }
207    }
208  
209    return SuffixesToAdd;
210  }
211