xref: /freebsd/contrib/llvm-project/llvm/include/llvm/Support/SuffixTreeNode.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines nodes for use within a SuffixTree.
10 //
11 // Each node has either no children or at least two children, with the root
12 // being a exception in the empty tree.
13 //
14 // Children are represented as a map between unsigned integers and nodes. If
15 // a node N has a child M on unsigned integer k, then the mapping represented
16 // by N is a proper prefix of the mapping represented by M. Note that this,
17 // although similar to a trie is somewhat different: each node stores a full
18 // substring of the full mapping rather than a single character state.
19 //
20 // Each internal node contains a pointer to the internal node representing
21 // the same string, but with the first character chopped off. This is stored
22 // in \p Link. Each leaf node stores the start index of its respective
23 // suffix in \p SuffixIdx.
24 //===----------------------------------------------------------------------===//
25 
26 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28 #include "llvm/ADT/DenseMap.h"
29 
30 namespace llvm {
31 
32 /// A node in a suffix tree which represents a substring or suffix.
33 struct SuffixTreeNode {
34 public:
35   /// Represents an undefined index in the suffix tree.
36   static const unsigned EmptyIdx = -1;
37   enum class NodeKind { ST_Leaf, ST_Internal };
38 
39 private:
40   const NodeKind Kind;
41 
42   /// The start index of this node's substring in the main string.
43   unsigned StartIdx = EmptyIdx;
44 
45   /// The length of the string formed by concatenating the edge labels from
46   /// the root to this node.
47   unsigned ConcatLen = 0;
48 
49   /// These two indices give a range of indices for its leaf descendants.
50   /// Imagine drawing a tree on paper and assigning a unique index to each leaf
51   /// node in monotonically increasing order from left to right. This way of
52   /// numbering the leaf nodes allows us to associate a continuous range of
53   /// indices with each internal node. For example, if a node has leaf
54   /// descendants with indices i, i+1, ..., j, then its LeftLeafIdx is i and
55   /// its RightLeafIdx is j. These indices are for LeafNodes in the SuffixTree
56   /// class, which is constructed using post-order depth-first traversal.
57   unsigned LeftLeafIdx = EmptyIdx;
58   unsigned RightLeafIdx = EmptyIdx;
59 
60 public:
61   // LLVM RTTI boilerplate.
getKindSuffixTreeNode62   NodeKind getKind() const { return Kind; }
63 
64   /// \return the start index of this node's substring in the entire string.
65   unsigned getStartIdx() const;
66 
67   /// \returns the end index of this node.
68   virtual unsigned getEndIdx() const = 0;
69 
70   /// \return the index of this node's left most leaf node.
71   unsigned getLeftLeafIdx() const;
72 
73   /// \return the index of this node's right most leaf node.
74   unsigned getRightLeafIdx() const;
75 
76   /// Set the index of the left most leaf node of this node to \p Idx.
77   void setLeftLeafIdx(unsigned Idx);
78 
79   /// Set the index of the right most leaf node of this node to \p Idx.
80   void setRightLeafIdx(unsigned Idx);
81 
82   /// Advance this node's StartIdx by \p Inc.
83   void incrementStartIdx(unsigned Inc);
84 
85   /// Set the length of the string from the root to this node to \p Len.
86   void setConcatLen(unsigned Len);
87 
88   /// \returns the length of the string from the root to this node.
89   unsigned getConcatLen() const;
90 
SuffixTreeNodeSuffixTreeNode91   SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
92       : Kind(Kind), StartIdx(StartIdx) {}
93   virtual ~SuffixTreeNode() = default;
94 };
95 
96 // A node with two or more children, or the root.
97 struct SuffixTreeInternalNode : SuffixTreeNode {
98 private:
99   /// The end index of this node's substring in the main string.
100   ///
101   /// Every leaf node must have its \p EndIdx incremented at the end of every
102   /// step in the construction algorithm. To avoid having to update O(N)
103   /// nodes individually at the end of every step, the end index is stored
104   /// as a pointer.
105   unsigned EndIdx = EmptyIdx;
106 
107   /// A pointer to the internal node representing the same sequence with the
108   /// first character chopped off.
109   ///
110   /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
111   /// Ukkonen's algorithm does to achieve linear-time construction is
112   /// keep track of which node the next insert should be at. This makes each
113   /// insert O(1), and there are a total of O(N) inserts. The suffix link
114   /// helps with inserting children of internal nodes.
115   ///
116   /// Say we add a child to an internal node with associated mapping S. The
117   /// next insertion must be at the node representing S - its first character.
118   /// This is given by the way that we iteratively build the tree in Ukkonen's
119   /// algorithm. The main idea is to look at the suffixes of each prefix in the
120   /// string, starting with the longest suffix of the prefix, and ending with
121   /// the shortest. Therefore, if we keep pointers between such nodes, we can
122   /// move to the next insertion point in O(1) time. If we don't, then we'd
123   /// have to query from the root, which takes O(N) time. This would make the
124   /// construction algorithm O(N^2) rather than O(N).
125   SuffixTreeInternalNode *Link = nullptr;
126 
127 public:
128   // LLVM RTTI boilerplate.
classofSuffixTreeInternalNode129   static bool classof(const SuffixTreeNode *N) {
130     return N->getKind() == NodeKind::ST_Internal;
131   }
132 
133   /// \returns true if this node is the root of its owning \p SuffixTree.
134   bool isRoot() const;
135 
136   /// \returns the end index of this node's substring in the entire string.
137   unsigned getEndIdx() const override;
138 
139   /// Sets \p Link to \p L. Assumes \p L is not null.
140   void setLink(SuffixTreeInternalNode *L);
141 
142   /// \returns the pointer to the Link node.
143   SuffixTreeInternalNode *getLink() const;
144 
145   /// The children of this node.
146   ///
147   /// A child existing on an unsigned integer implies that from the mapping
148   /// represented by the current node, there is a way to reach another
149   /// mapping by tacking that character on the end of the current string.
150   DenseMap<unsigned, SuffixTreeNode *> Children;
151 
SuffixTreeInternalNodeSuffixTreeInternalNode152   SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
153                          SuffixTreeInternalNode *Link)
154       : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
155         Link(Link) {}
156 
157   virtual ~SuffixTreeInternalNode() = default;
158 };
159 
160 // A node representing a suffix.
161 struct SuffixTreeLeafNode : SuffixTreeNode {
162 private:
163   /// The start index of the suffix represented by this leaf.
164   unsigned SuffixIdx = EmptyIdx;
165 
166   /// The end index of this node's substring in the main string.
167   ///
168   /// Every leaf node must have its \p EndIdx incremented at the end of every
169   /// step in the construction algorithm. To avoid having to update O(N)
170   /// nodes individually at the end of every step, the end index is stored
171   /// as a pointer.
172   unsigned *EndIdx = nullptr;
173 
174 public:
175   // LLVM RTTI boilerplate.
classofSuffixTreeLeafNode176   static bool classof(const SuffixTreeNode *N) {
177     return N->getKind() == NodeKind::ST_Leaf;
178   }
179 
180   /// \returns the end index of this node's substring in the entire string.
181   unsigned getEndIdx() const override;
182 
183   /// \returns the start index of the suffix represented by this leaf.
184   unsigned getSuffixIdx() const;
185 
186   /// Sets the start index of the suffix represented by this leaf to \p Idx.
187   void setSuffixIdx(unsigned Idx);
SuffixTreeLeafNodeSuffixTreeLeafNode188   SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
189       : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
190 
191   virtual ~SuffixTreeLeafNode() = default;
192 };
193 } // namespace llvm
194 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
195