xref: /freebsd/contrib/llvm-project/llvm/lib/ProfileData/MemProfRadixTree.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // This file contains logic that implements a space efficient radix tree
9 // encoding for callstacks used by MemProf.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/ProfileData/MemProfRadixTree.h"
14 
15 namespace llvm {
16 namespace memprof {
17 // Encode a call stack into RadixArray.  Return the starting index within
18 // RadixArray.  For each call stack we encode, we emit two or three components
19 // into RadixArray.  If a given call stack doesn't have a common prefix relative
20 // to the previous one, we emit:
21 //
22 // - the frames in the given call stack in the root-to-leaf order
23 //
24 // - the length of the given call stack
25 //
26 // If a given call stack has a non-empty common prefix relative to the previous
27 // one, we emit:
28 //
29 // - the relative location of the common prefix, encoded as a negative number.
30 //
31 // - a portion of the given call stack that's beyond the common prefix
32 //
33 // - the length of the given call stack, including the length of the common
34 //   prefix.
35 //
36 // The resulting RadixArray requires a somewhat unintuitive backward traversal
37 // to reconstruct a call stack -- read the call stack length and scan backward
38 // while collecting frames in the leaf to root order.  build, the caller of this
39 // function, reverses RadixArray in place so that we can reconstruct a call
40 // stack as if we were deserializing an array in a typical way -- the call stack
41 // length followed by the frames in the leaf-to-root order except that we need
42 // to handle pointers to parents along the way.
43 //
44 // To quickly determine the location of the common prefix within RadixArray,
45 // Indexes caches the indexes of the previous call stack's frames within
46 // RadixArray.
47 template <typename FrameIdTy>
48 LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
49     const llvm::SmallVector<FrameIdTy> *CallStack,
50     const llvm::SmallVector<FrameIdTy> *Prev,
51     const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
52   // Compute the length of the common root prefix between Prev and CallStack.
53   uint32_t CommonLen = 0;
54   if (Prev) {
55     auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
56                              CallStack->rend());
57     CommonLen = std::distance(CallStack->rbegin(), Pos.second);
58   }
59 
60   // Drop the portion beyond CommonLen.
61   assert(CommonLen <= Indexes.size());
62   Indexes.resize(CommonLen);
63 
64   // Append a pointer to the parent.
65   if (CommonLen) {
66     uint32_t CurrentIndex = RadixArray.size();
67     uint32_t ParentIndex = Indexes.back();
68     // The offset to the parent must be negative because we are pointing to an
69     // element we've already added to RadixArray.
70     assert(ParentIndex < CurrentIndex);
71     RadixArray.push_back(ParentIndex - CurrentIndex);
72   }
73 
74   // Copy the part of the call stack beyond the common prefix to RadixArray.
75   assert(CommonLen <= CallStack->size());
76   for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
77     // Remember the index of F in RadixArray.
78     Indexes.push_back(RadixArray.size());
79     RadixArray.push_back(
80         MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
81   }
82   assert(CallStack->size() == Indexes.size());
83 
84   // End with the call stack length.
85   RadixArray.push_back(CallStack->size());
86 
87   // Return the index within RadixArray where we can start reconstructing a
88   // given call stack from.
89   return RadixArray.size() - 1;
90 }
91 
92 template <typename FrameIdTy>
93 void CallStackRadixTreeBuilder<FrameIdTy>::build(
94     llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
95         &&MemProfCallStackData,
96     const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
97     llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
98   // Take the vector portion of MemProfCallStackData.  The vector is exactly
99   // what we need to sort.  Also, we no longer need its lookup capability.
100   llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
101 
102   // Return early if we have no work to do.
103   if (CallStacks.empty()) {
104     RadixArray.clear();
105     CallStackPos.clear();
106     return;
107   }
108 
109   // Sorting the list of call stacks in the dictionary order is sufficient to
110   // maximize the length of the common prefix between two adjacent call stacks
111   // and thus minimize the length of RadixArray.  However, we go one step
112   // further and try to reduce the number of times we follow pointers to parents
113   // during deserilization.  Consider a poorly encoded radix tree:
114   //
115   // CallStackId 1:  f1 -> f2 -> f3
116   //                  |
117   // CallStackId 2:   +--- f4 -> f5
118   //                        |
119   // CallStackId 3:         +--> f6
120   //
121   // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
122   // Once we encode CallStackId 1 into RadixArray, every other call stack with
123   // common prefix f1 ends up pointing to CallStackId 1.  Since CallStackId 3
124   // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
125   // parents twice.
126   //
127   // We try to alleviate the situation by sorting the list of call stacks by
128   // comparing the popularity of frames rather than the integer values of
129   // FrameIds.  In the example above, f4 is more popular than f2, so we sort the
130   // call stacks and encode them as:
131   //
132   // CallStackId 2:  f1 -- f4 -> f5
133   //                  |     |
134   // CallStackId 3:   |     +--> f6
135   //                  |
136   // CallStackId 1:   +--> f2 -> f3
137   //
138   // Notice that CallStackId 3 follows a pointer to a parent only once.
139   //
140   // All this is a quick-n-dirty trick to reduce the number of jumps.  The
141   // proper way would be to compute the weight of each radix tree node -- how
142   // many call stacks use a given radix tree node, and encode a radix tree from
143   // the heaviest node first.  We do not do so because that's a lot of work.
144   llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
145     // Call stacks are stored from leaf to root.  Perform comparisons from the
146     // root.
147     return std::lexicographical_compare(
148         L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
149         [&](FrameIdTy F1, FrameIdTy F2) {
150           uint64_t H1 = FrameHistogram[F1].Count;
151           uint64_t H2 = FrameHistogram[F2].Count;
152           // Popular frames should come later because we encode call stacks from
153           // the last one in the list.
154           if (H1 != H2)
155             return H1 < H2;
156           // For sort stability.
157           return F1 < F2;
158         });
159   });
160 
161   // Reserve some reasonable amount of storage.
162   RadixArray.clear();
163   RadixArray.reserve(CallStacks.size() * 8);
164 
165   // Indexes will grow as long as the longest call stack.
166   Indexes.clear();
167   Indexes.reserve(512);
168 
169   // CallStackPos will grow to exactly CallStacks.size() entries.
170   CallStackPos.clear();
171   CallStackPos.reserve(CallStacks.size());
172 
173   // Compute the radix array.  We encode one call stack at a time, computing the
174   // longest prefix that's shared with the previous call stack we encode.  For
175   // each call stack we encode, we remember a mapping from CallStackId to its
176   // position within RadixArray.
177   //
178   // As an optimization, we encode from the last call stack in CallStacks to
179   // reduce the number of times we follow pointers to the parents.  Consider the
180   // list of call stacks that has been sorted in the dictionary order:
181   //
182   // Call Stack 1: F1
183   // Call Stack 2: F1 -> F2
184   // Call Stack 3: F1 -> F2 -> F3
185   //
186   // If we traversed CallStacks in the forward order, we would end up with a
187   // radix tree like:
188   //
189   // Call Stack 1:  F1
190   //                |
191   // Call Stack 2:  +---> F2
192   //                      |
193   // Call Stack 3:        +---> F3
194   //
195   // Notice that each call stack jumps to the previous one.  However, if we
196   // traverse CallStacks in the reverse order, then Call Stack 3 has the
197   // complete call stack encoded without any pointers.  Call Stack 1 and 2 point
198   // to appropriate prefixes of Call Stack 3.
199   const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
200   for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
201     LinearCallStackId Pos =
202         encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
203     CallStackPos.insert({CSId, Pos});
204     Prev = &CallStack;
205   }
206 
207   // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
208   assert(!RadixArray.empty());
209 
210   // Reverse the radix array in place.  We do so mostly for intuitive
211   // deserialization where we would read the length field and then the call
212   // stack frames proper just like any other array deserialization, except
213   // that we have occasional jumps to take advantage of prefixes.
214   for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
215     std::swap(RadixArray[I], RadixArray[J]);
216 
217   // "Reverse" the indexes stored in CallStackPos.
218   for (auto &[K, V] : CallStackPos)
219     V = RadixArray.size() - 1 - V;
220 }
221 
222 // Explicitly instantiate class with the utilized FrameIdTy.
223 template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>;
224 template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>;
225 
226 template <typename FrameIdTy>
227 llvm::DenseMap<FrameIdTy, FrameStat>
228 computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
229                           &MemProfCallStackData) {
230   llvm::DenseMap<FrameIdTy, FrameStat> Histogram;
231 
232   for (const auto &KV : MemProfCallStackData) {
233     const auto &CS = KV.second;
234     for (unsigned I = 0, E = CS.size(); I != E; ++I) {
235       auto &S = Histogram[CS[I]];
236       ++S.Count;
237       S.PositionSum += I;
238     }
239   }
240   return Histogram;
241 }
242 
243 // Explicitly instantiate function with the utilized FrameIdTy.
244 template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>
245 computeFrameHistogram<FrameId>(
246     llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
247         &MemProfCallStackData);
248 template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>
249 computeFrameHistogram<LinearFrameId>(
250     llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
251         &MemProfCallStackData);
252 } // namespace memprof
253 } // namespace llvm
254