1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Suffix Tree class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/SuffixTree.h" 14 #include "llvm/Support/Allocator.h" 15 #include <vector> 16 17 using namespace llvm; 18 19 SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) { 20 Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0); 21 Active.Node = Root; 22 23 // Keep track of the number of suffixes we have to add of the current 24 // prefix. 25 unsigned SuffixesToAdd = 0; 26 27 // Construct the suffix tree iteratively on each prefix of the string. 28 // PfxEndIdx is the end index of the current prefix. 29 // End is one past the last element in the string. 30 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { 31 SuffixesToAdd++; 32 LeafEndIdx = PfxEndIdx; // Extend each of the leaves. 33 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); 34 } 35 36 // Set the suffix indices of each leaf. 37 assert(Root && "Root node can't be nullptr!"); 38 setSuffixIndices(); 39 } 40 41 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent, 42 unsigned StartIdx, unsigned Edge) { 43 44 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); 45 46 SuffixTreeNode *N = new (NodeAllocator.Allocate()) 47 SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr); 48 Parent.Children[Edge] = N; 49 50 return N; 51 } 52 53 SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent, 54 unsigned StartIdx, 55 unsigned EndIdx, unsigned Edge) { 56 57 assert(StartIdx <= EndIdx && "String can't start after it ends!"); 58 assert(!(!Parent && StartIdx != EmptyIdx) && 59 "Non-root internal nodes must have parents!"); 60 61 unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx); 62 SuffixTreeNode *N = 63 new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root); 64 if (Parent) 65 Parent->Children[Edge] = N; 66 67 return N; 68 } 69 70 void SuffixTree::setSuffixIndices() { 71 // List of nodes we need to visit along with the current length of the 72 // string. 73 std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit; 74 75 // Current node being visited. 76 SuffixTreeNode *CurrNode = Root; 77 78 // Sum of the lengths of the nodes down the path to the current one. 79 unsigned CurrNodeLen = 0; 80 ToVisit.push_back({CurrNode, CurrNodeLen}); 81 while (!ToVisit.empty()) { 82 std::tie(CurrNode, CurrNodeLen) = ToVisit.back(); 83 ToVisit.pop_back(); 84 CurrNode->ConcatLen = CurrNodeLen; 85 for (auto &ChildPair : CurrNode->Children) { 86 assert(ChildPair.second && "Node had a null child!"); 87 ToVisit.push_back( 88 {ChildPair.second, CurrNodeLen + ChildPair.second->size()}); 89 } 90 91 // No children, so we are at the end of the string. 92 if (CurrNode->Children.size() == 0 && !CurrNode->isRoot()) 93 CurrNode->SuffixIdx = Str.size() - CurrNodeLen; 94 } 95 } 96 97 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { 98 SuffixTreeNode *NeedsLink = nullptr; 99 100 while (SuffixesToAdd > 0) { 101 102 // Are we waiting to add anything other than just the last character? 103 if (Active.Len == 0) { 104 // If not, then say the active index is the end index. 105 Active.Idx = EndIdx; 106 } 107 108 assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); 109 110 // The first character in the current substring we're looking at. 111 unsigned FirstChar = Str[Active.Idx]; 112 113 // Have we inserted anything starting with FirstChar at the current node? 114 if (Active.Node->Children.count(FirstChar) == 0) { 115 // If not, then we can just insert a leaf and move to the next step. 116 insertLeaf(*Active.Node, EndIdx, FirstChar); 117 118 // The active node is an internal node, and we visited it, so it must 119 // need a link if it doesn't have one. 120 if (NeedsLink) { 121 NeedsLink->Link = Active.Node; 122 NeedsLink = nullptr; 123 } 124 } else { 125 // There's a match with FirstChar, so look for the point in the tree to 126 // insert a new node. 127 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; 128 129 unsigned SubstringLen = NextNode->size(); 130 131 // Is the current suffix we're trying to insert longer than the size of 132 // the child we want to move to? 133 if (Active.Len >= SubstringLen) { 134 // If yes, then consume the characters we've seen and move to the next 135 // node. 136 Active.Idx += SubstringLen; 137 Active.Len -= SubstringLen; 138 Active.Node = NextNode; 139 continue; 140 } 141 142 // Otherwise, the suffix we're trying to insert must be contained in the 143 // next node we want to move to. 144 unsigned LastChar = Str[EndIdx]; 145 146 // Is the string we're trying to insert a substring of the next node? 147 if (Str[NextNode->StartIdx + Active.Len] == LastChar) { 148 // If yes, then we're done for this step. Remember our insertion point 149 // and move to the next end index. At this point, we have an implicit 150 // suffix tree. 151 if (NeedsLink && !Active.Node->isRoot()) { 152 NeedsLink->Link = Active.Node; 153 NeedsLink = nullptr; 154 } 155 156 Active.Len++; 157 break; 158 } 159 160 // The string we're trying to insert isn't a substring of the next node, 161 // but matches up to a point. Split the node. 162 // 163 // For example, say we ended our search at a node n and we're trying to 164 // insert ABD. Then we'll create a new node s for AB, reduce n to just 165 // representing C, and insert a new leaf node l to represent d. This 166 // allows us to ensure that if n was a leaf, it remains a leaf. 167 // 168 // | ABC ---split---> | AB 169 // n s 170 // C / \ D 171 // n l 172 173 // The node s from the diagram 174 SuffixTreeNode *SplitNode = 175 insertInternalNode(Active.Node, NextNode->StartIdx, 176 NextNode->StartIdx + Active.Len - 1, FirstChar); 177 178 // Insert the new node representing the new substring into the tree as 179 // a child of the split node. This is the node l from the diagram. 180 insertLeaf(*SplitNode, EndIdx, LastChar); 181 182 // Make the old node a child of the split node and update its start 183 // index. This is the node n from the diagram. 184 NextNode->StartIdx += Active.Len; 185 SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; 186 187 // SplitNode is an internal node, update the suffix link. 188 if (NeedsLink) 189 NeedsLink->Link = SplitNode; 190 191 NeedsLink = SplitNode; 192 } 193 194 // We've added something new to the tree, so there's one less suffix to 195 // add. 196 SuffixesToAdd--; 197 198 if (Active.Node->isRoot()) { 199 if (Active.Len > 0) { 200 Active.Len--; 201 Active.Idx = EndIdx - SuffixesToAdd + 1; 202 } 203 } else { 204 // Start the next phase at the next smallest suffix. 205 Active.Node = Active.Node->Link; 206 } 207 } 208 209 return SuffixesToAdd; 210 } 211