1 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines nodes for use within a SuffixTree. 10 // 11 // Each node has either no children or at least two children, with the root 12 // being a exception in the empty tree. 13 // 14 // Children are represented as a map between unsigned integers and nodes. If 15 // a node N has a child M on unsigned integer k, then the mapping represented 16 // by N is a proper prefix of the mapping represented by M. Note that this, 17 // although similar to a trie is somewhat different: each node stores a full 18 // substring of the full mapping rather than a single character state. 19 // 20 // Each internal node contains a pointer to the internal node representing 21 // the same string, but with the first character chopped off. This is stored 22 // in \p Link. Each leaf node stores the start index of its respective 23 // suffix in \p SuffixIdx. 24 //===----------------------------------------------------------------------===// 25 26 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H 27 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H 28 #include "llvm/ADT/DenseMap.h" 29 30 namespace llvm { 31 32 /// A node in a suffix tree which represents a substring or suffix. 33 struct SuffixTreeNode { 34 public: 35 /// Represents an undefined index in the suffix tree. 36 static const unsigned EmptyIdx = -1; 37 enum class NodeKind { ST_Leaf, ST_Internal }; 38 39 private: 40 const NodeKind Kind; 41 42 /// The start index of this node's substring in the main string. 43 unsigned StartIdx = EmptyIdx; 44 45 /// The length of the string formed by concatenating the edge labels from 46 /// the root to this node. 47 unsigned ConcatLen = 0; 48 49 /// These two indices give a range of indices for its leaf descendants. 50 /// Imagine drawing a tree on paper and assigning a unique index to each leaf 51 /// node in monotonically increasing order from left to right. This way of 52 /// numbering the leaf nodes allows us to associate a continuous range of 53 /// indices with each internal node. For example, if a node has leaf 54 /// descendants with indices i, i+1, ..., j, then its LeftLeafIdx is i and 55 /// its RightLeafIdx is j. These indices are for LeafNodes in the SuffixTree 56 /// class, which is constructed using post-order depth-first traversal. 57 unsigned LeftLeafIdx = EmptyIdx; 58 unsigned RightLeafIdx = EmptyIdx; 59 60 public: 61 // LLVM RTTI boilerplate. getKindSuffixTreeNode62 NodeKind getKind() const { return Kind; } 63 64 /// \return the start index of this node's substring in the entire string. 65 unsigned getStartIdx() const; 66 67 /// \returns the end index of this node. 68 virtual unsigned getEndIdx() const = 0; 69 70 /// \return the index of this node's left most leaf node. 71 unsigned getLeftLeafIdx() const; 72 73 /// \return the index of this node's right most leaf node. 74 unsigned getRightLeafIdx() const; 75 76 /// Set the index of the left most leaf node of this node to \p Idx. 77 void setLeftLeafIdx(unsigned Idx); 78 79 /// Set the index of the right most leaf node of this node to \p Idx. 80 void setRightLeafIdx(unsigned Idx); 81 82 /// Advance this node's StartIdx by \p Inc. 83 void incrementStartIdx(unsigned Inc); 84 85 /// Set the length of the string from the root to this node to \p Len. 86 void setConcatLen(unsigned Len); 87 88 /// \returns the length of the string from the root to this node. 89 unsigned getConcatLen() const; 90 SuffixTreeNodeSuffixTreeNode91 SuffixTreeNode(NodeKind Kind, unsigned StartIdx) 92 : Kind(Kind), StartIdx(StartIdx) {} 93 virtual ~SuffixTreeNode() = default; 94 }; 95 96 // A node with two or more children, or the root. 97 struct SuffixTreeInternalNode : SuffixTreeNode { 98 private: 99 /// The end index of this node's substring in the main string. 100 /// 101 /// Every leaf node must have its \p EndIdx incremented at the end of every 102 /// step in the construction algorithm. To avoid having to update O(N) 103 /// nodes individually at the end of every step, the end index is stored 104 /// as a pointer. 105 unsigned EndIdx = EmptyIdx; 106 107 /// A pointer to the internal node representing the same sequence with the 108 /// first character chopped off. 109 /// 110 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that 111 /// Ukkonen's algorithm does to achieve linear-time construction is 112 /// keep track of which node the next insert should be at. This makes each 113 /// insert O(1), and there are a total of O(N) inserts. The suffix link 114 /// helps with inserting children of internal nodes. 115 /// 116 /// Say we add a child to an internal node with associated mapping S. The 117 /// next insertion must be at the node representing S - its first character. 118 /// This is given by the way that we iteratively build the tree in Ukkonen's 119 /// algorithm. The main idea is to look at the suffixes of each prefix in the 120 /// string, starting with the longest suffix of the prefix, and ending with 121 /// the shortest. Therefore, if we keep pointers between such nodes, we can 122 /// move to the next insertion point in O(1) time. If we don't, then we'd 123 /// have to query from the root, which takes O(N) time. This would make the 124 /// construction algorithm O(N^2) rather than O(N). 125 SuffixTreeInternalNode *Link = nullptr; 126 127 public: 128 // LLVM RTTI boilerplate. classofSuffixTreeInternalNode129 static bool classof(const SuffixTreeNode *N) { 130 return N->getKind() == NodeKind::ST_Internal; 131 } 132 133 /// \returns true if this node is the root of its owning \p SuffixTree. 134 bool isRoot() const; 135 136 /// \returns the end index of this node's substring in the entire string. 137 unsigned getEndIdx() const override; 138 139 /// Sets \p Link to \p L. Assumes \p L is not null. 140 void setLink(SuffixTreeInternalNode *L); 141 142 /// \returns the pointer to the Link node. 143 SuffixTreeInternalNode *getLink() const; 144 145 /// The children of this node. 146 /// 147 /// A child existing on an unsigned integer implies that from the mapping 148 /// represented by the current node, there is a way to reach another 149 /// mapping by tacking that character on the end of the current string. 150 DenseMap<unsigned, SuffixTreeNode *> Children; 151 SuffixTreeInternalNodeSuffixTreeInternalNode152 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, 153 SuffixTreeInternalNode *Link) 154 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), 155 Link(Link) {} 156 157 virtual ~SuffixTreeInternalNode() = default; 158 }; 159 160 // A node representing a suffix. 161 struct SuffixTreeLeafNode : SuffixTreeNode { 162 private: 163 /// The start index of the suffix represented by this leaf. 164 unsigned SuffixIdx = EmptyIdx; 165 166 /// The end index of this node's substring in the main string. 167 /// 168 /// Every leaf node must have its \p EndIdx incremented at the end of every 169 /// step in the construction algorithm. To avoid having to update O(N) 170 /// nodes individually at the end of every step, the end index is stored 171 /// as a pointer. 172 unsigned *EndIdx = nullptr; 173 174 public: 175 // LLVM RTTI boilerplate. classofSuffixTreeLeafNode176 static bool classof(const SuffixTreeNode *N) { 177 return N->getKind() == NodeKind::ST_Leaf; 178 } 179 180 /// \returns the end index of this node's substring in the entire string. 181 unsigned getEndIdx() const override; 182 183 /// \returns the start index of the suffix represented by this leaf. 184 unsigned getSuffixIdx() const; 185 186 /// Sets the start index of the suffix represented by this leaf to \p Idx. 187 void setSuffixIdx(unsigned Idx); SuffixTreeLeafNodeSuffixTreeLeafNode188 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) 189 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} 190 191 virtual ~SuffixTreeLeafNode() = default; 192 }; 193 } // namespace llvm 194 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H 195