1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Suffix Tree class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/SuffixTree.h" 14 #include "llvm/Support/Allocator.h" 15 #include "llvm/Support/Casting.h" 16 #include "llvm/Support/SuffixTreeNode.h" 17 18 using namespace llvm; 19 20 /// \returns the number of elements in the substring associated with \p N. 21 static size_t numElementsInSubstring(const SuffixTreeNode *N) { 22 assert(N && "Got a null node?"); 23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N)) 24 if (Internal->isRoot()) 25 return 0; 26 return N->getEndIdx() - N->getStartIdx() + 1; 27 } 28 29 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str, 30 bool OutlinerLeafDescendants) 31 : Str(Str), OutlinerLeafDescendants(OutlinerLeafDescendants) { 32 Root = insertRoot(); 33 Active.Node = Root; 34 35 // Keep track of the number of suffixes we have to add of the current 36 // prefix. 37 unsigned SuffixesToAdd = 0; 38 39 // Construct the suffix tree iteratively on each prefix of the string. 40 // PfxEndIdx is the end index of the current prefix. 41 // End is one past the last element in the string. 42 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { 43 SuffixesToAdd++; 44 LeafEndIdx = PfxEndIdx; // Extend each of the leaves. 45 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); 46 } 47 48 // Set the suffix indices of each leaf. 49 assert(Root && "Root node can't be nullptr!"); 50 setSuffixIndices(); 51 52 // Collect all leaf nodes of the suffix tree. And for each internal node, 53 // record the range of leaf nodes that are descendants of it. 54 if (OutlinerLeafDescendants) 55 setLeafNodes(); 56 } 57 58 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent, 59 unsigned StartIdx, unsigned Edge) { 60 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); 61 auto *N = new (LeafNodeAllocator.Allocate()) 62 SuffixTreeLeafNode(StartIdx, &LeafEndIdx); 63 Parent.Children[Edge] = N; 64 return N; 65 } 66 67 SuffixTreeInternalNode * 68 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent, 69 unsigned StartIdx, unsigned EndIdx, 70 unsigned Edge) { 71 assert(StartIdx <= EndIdx && "String can't start after it ends!"); 72 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) && 73 "Non-root internal nodes must have parents!"); 74 auto *N = new (InternalNodeAllocator.Allocate()) 75 SuffixTreeInternalNode(StartIdx, EndIdx, Root); 76 if (Parent) 77 Parent->Children[Edge] = N; 78 return N; 79 } 80 81 SuffixTreeInternalNode *SuffixTree::insertRoot() { 82 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx, 83 SuffixTreeNode::EmptyIdx, /*Edge = */ 0); 84 } 85 86 void SuffixTree::setSuffixIndices() { 87 // List of nodes we need to visit along with the current length of the 88 // string. 89 SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit; 90 91 // Current node being visited. 92 SuffixTreeNode *CurrNode = Root; 93 94 // Sum of the lengths of the nodes down the path to the current one. 95 unsigned CurrNodeLen = 0; 96 ToVisit.push_back({CurrNode, CurrNodeLen}); 97 while (!ToVisit.empty()) { 98 std::tie(CurrNode, CurrNodeLen) = ToVisit.back(); 99 ToVisit.pop_back(); 100 // Length of the current node from the root down to here. 101 CurrNode->setConcatLen(CurrNodeLen); 102 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) 103 for (auto &ChildPair : InternalNode->Children) { 104 assert(ChildPair.second && "Node had a null child!"); 105 ToVisit.push_back( 106 {ChildPair.second, 107 CurrNodeLen + numElementsInSubstring(ChildPair.second)}); 108 } 109 // No children, so we are at the end of the string. 110 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode)) 111 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen); 112 } 113 } 114 115 void SuffixTree::setLeafNodes() { 116 // A stack that keeps track of nodes to visit for post-order DFS traversal. 117 SmallVector<SuffixTreeNode *> ToVisit; 118 ToVisit.push_back(Root); 119 120 // This keeps track of the index of the next leaf node to be added to 121 // the LeafNodes vector of the suffix tree. 122 unsigned LeafCounter = 0; 123 124 // This keeps track of nodes whose children have been added to the stack. 125 // The value is a pair, representing a node's first and last children. 126 DenseMap<SuffixTreeInternalNode *, 127 std::pair<SuffixTreeNode *, SuffixTreeNode *>> 128 ChildrenMap; 129 130 // Traverse the tree in post-order. 131 while (!ToVisit.empty()) { 132 SuffixTreeNode *CurrNode = ToVisit.pop_back_val(); 133 if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) { 134 // The current node is an internal node. 135 auto I = ChildrenMap.find(CurrInternalNode); 136 if (I == ChildrenMap.end()) { 137 // This is the first time we visit this node. 138 // Its children have not been added to the stack yet. 139 // We add current node back, and add its children to the stack. 140 // We keep track of the first and last children of the current node. 141 auto J = CurrInternalNode->Children.begin(); 142 if (J != CurrInternalNode->Children.end()) { 143 ToVisit.push_back(CurrNode); 144 SuffixTreeNode *FirstChild = J->second; 145 SuffixTreeNode *LastChild = nullptr; 146 for (; J != CurrInternalNode->Children.end(); ++J) { 147 LastChild = J->second; 148 ToVisit.push_back(LastChild); 149 } 150 ChildrenMap[CurrInternalNode] = {FirstChild, LastChild}; 151 } 152 } else { 153 // This is the second time we visit this node. 154 // All of its children have already been processed. 155 // Now, we can set its LeftLeafIdx and RightLeafIdx; 156 auto [FirstChild, LastChild] = I->second; 157 // Get the first child to use its RightLeafIdx. 158 // The first child is the first one added to the stack, so it is 159 // the last one to be processed. Hence, the leaf descendants 160 // of the first child are assigned the largest index numbers. 161 CurrNode->setRightLeafIdx(FirstChild->getRightLeafIdx()); 162 // Get the last child to use its LeftLeafIdx. 163 CurrNode->setLeftLeafIdx(LastChild->getLeftLeafIdx()); 164 assert(CurrNode->getLeftLeafIdx() <= CurrNode->getRightLeafIdx() && 165 "LeftLeafIdx should not be larger than RightLeafIdx"); 166 } 167 } else { 168 // The current node is a leaf node. 169 // We can simply set its LeftLeafIdx and RightLeafIdx. 170 CurrNode->setLeftLeafIdx(LeafCounter); 171 CurrNode->setRightLeafIdx(LeafCounter); 172 ++LeafCounter; 173 auto *CurrLeafNode = cast<SuffixTreeLeafNode>(CurrNode); 174 LeafNodes.push_back(CurrLeafNode); 175 } 176 } 177 } 178 179 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { 180 SuffixTreeInternalNode *NeedsLink = nullptr; 181 182 while (SuffixesToAdd > 0) { 183 184 // Are we waiting to add anything other than just the last character? 185 if (Active.Len == 0) { 186 // If not, then say the active index is the end index. 187 Active.Idx = EndIdx; 188 } 189 190 assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); 191 192 // The first character in the current substring we're looking at. 193 unsigned FirstChar = Str[Active.Idx]; 194 195 // Have we inserted anything starting with FirstChar at the current node? 196 if (Active.Node->Children.count(FirstChar) == 0) { 197 // If not, then we can just insert a leaf and move to the next step. 198 insertLeaf(*Active.Node, EndIdx, FirstChar); 199 200 // The active node is an internal node, and we visited it, so it must 201 // need a link if it doesn't have one. 202 if (NeedsLink) { 203 NeedsLink->setLink(Active.Node); 204 NeedsLink = nullptr; 205 } 206 } else { 207 // There's a match with FirstChar, so look for the point in the tree to 208 // insert a new node. 209 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; 210 211 unsigned SubstringLen = numElementsInSubstring(NextNode); 212 213 // Is the current suffix we're trying to insert longer than the size of 214 // the child we want to move to? 215 if (Active.Len >= SubstringLen) { 216 // If yes, then consume the characters we've seen and move to the next 217 // node. 218 assert(isa<SuffixTreeInternalNode>(NextNode) && 219 "Expected an internal node?"); 220 Active.Idx += SubstringLen; 221 Active.Len -= SubstringLen; 222 Active.Node = cast<SuffixTreeInternalNode>(NextNode); 223 continue; 224 } 225 226 // Otherwise, the suffix we're trying to insert must be contained in the 227 // next node we want to move to. 228 unsigned LastChar = Str[EndIdx]; 229 230 // Is the string we're trying to insert a substring of the next node? 231 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) { 232 // If yes, then we're done for this step. Remember our insertion point 233 // and move to the next end index. At this point, we have an implicit 234 // suffix tree. 235 if (NeedsLink && !Active.Node->isRoot()) { 236 NeedsLink->setLink(Active.Node); 237 NeedsLink = nullptr; 238 } 239 240 Active.Len++; 241 break; 242 } 243 244 // The string we're trying to insert isn't a substring of the next node, 245 // but matches up to a point. Split the node. 246 // 247 // For example, say we ended our search at a node n and we're trying to 248 // insert ABD. Then we'll create a new node s for AB, reduce n to just 249 // representing C, and insert a new leaf node l to represent d. This 250 // allows us to ensure that if n was a leaf, it remains a leaf. 251 // 252 // | ABC ---split---> | AB 253 // n s 254 // C / \ D 255 // n l 256 257 // The node s from the diagram 258 SuffixTreeInternalNode *SplitNode = insertInternalNode( 259 Active.Node, NextNode->getStartIdx(), 260 NextNode->getStartIdx() + Active.Len - 1, FirstChar); 261 262 // Insert the new node representing the new substring into the tree as 263 // a child of the split node. This is the node l from the diagram. 264 insertLeaf(*SplitNode, EndIdx, LastChar); 265 266 // Make the old node a child of the split node and update its start 267 // index. This is the node n from the diagram. 268 NextNode->incrementStartIdx(Active.Len); 269 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode; 270 271 // SplitNode is an internal node, update the suffix link. 272 if (NeedsLink) 273 NeedsLink->setLink(SplitNode); 274 275 NeedsLink = SplitNode; 276 } 277 278 // We've added something new to the tree, so there's one less suffix to 279 // add. 280 SuffixesToAdd--; 281 282 if (Active.Node->isRoot()) { 283 if (Active.Len > 0) { 284 Active.Len--; 285 Active.Idx = EndIdx - SuffixesToAdd + 1; 286 } 287 } else { 288 // Start the next phase at the next smallest suffix. 289 Active.Node = Active.Node->getLink(); 290 } 291 } 292 293 return SuffixesToAdd; 294 } 295 296 void SuffixTree::RepeatedSubstringIterator::advance() { 297 // Clear the current state. If we're at the end of the range, then this 298 // is the state we want to be in. 299 RS = RepeatedSubstring(); 300 N = nullptr; 301 302 // Each leaf node represents a repeat of a string. 303 SmallVector<unsigned> RepeatedSubstringStarts; 304 305 // Continue visiting nodes until we find one which repeats more than once. 306 while (!InternalNodesToVisit.empty()) { 307 RepeatedSubstringStarts.clear(); 308 auto *Curr = InternalNodesToVisit.back(); 309 InternalNodesToVisit.pop_back(); 310 311 // Keep track of the length of the string associated with the node. If 312 // it's too short, we'll quit. 313 unsigned Length = Curr->getConcatLen(); 314 315 // Iterate over each child, saving internal nodes for visiting. 316 // Internal nodes represent individual strings, which may repeat. 317 for (auto &ChildPair : Curr->Children) 318 // Save all of this node's children for processing. 319 if (auto *InternalChild = 320 dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) 321 InternalNodesToVisit.push_back(InternalChild); 322 323 // If length of repeated substring is below threshold, then skip it. 324 if (Length < MinLength) 325 continue; 326 327 // The root never represents a repeated substring. If we're looking at 328 // that, then skip it. 329 if (Curr->isRoot()) 330 continue; 331 332 // Collect leaf children or leaf descendants by OutlinerLeafDescendants. 333 if (OutlinerLeafDescendants) { 334 for (unsigned I = Curr->getLeftLeafIdx(); I <= Curr->getRightLeafIdx(); 335 ++I) 336 RepeatedSubstringStarts.push_back(LeafNodes[I]->getSuffixIdx()); 337 } else { 338 for (auto &ChildPair : Curr->Children) 339 if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(ChildPair.second)) 340 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx()); 341 } 342 343 // Do we have any repeated substrings? 344 if (RepeatedSubstringStarts.size() < 2) 345 continue; 346 347 // Yes. Update the state to reflect this, and then bail out. 348 N = Curr; 349 RS.Length = Length; 350 for (unsigned StartIdx : RepeatedSubstringStarts) 351 RS.StartIndices.push_back(StartIdx); 352 break; 353 } 354 // At this point, either NewRS is an empty RepeatedSubstring, or it was 355 // set in the above loop. Similarly, N is either nullptr, or the node 356 // associated with NewRS. 357 } 358