1*700637cbSDimitry Andric //===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//
2*700637cbSDimitry Andric //
3*700637cbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*700637cbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*700637cbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*700637cbSDimitry Andric //
7*700637cbSDimitry Andric //===----------------------------------------------------------------------===//
8*700637cbSDimitry Andric // This file contains logic that implements a space efficient radix tree
9*700637cbSDimitry Andric // encoding for callstacks used by MemProf.
10*700637cbSDimitry Andric //
11*700637cbSDimitry Andric //===----------------------------------------------------------------------===//
12*700637cbSDimitry Andric
13*700637cbSDimitry Andric #include "llvm/ProfileData/MemProfRadixTree.h"
14*700637cbSDimitry Andric
15*700637cbSDimitry Andric namespace llvm {
16*700637cbSDimitry Andric namespace memprof {
17*700637cbSDimitry Andric // Encode a call stack into RadixArray. Return the starting index within
18*700637cbSDimitry Andric // RadixArray. For each call stack we encode, we emit two or three components
19*700637cbSDimitry Andric // into RadixArray. If a given call stack doesn't have a common prefix relative
20*700637cbSDimitry Andric // to the previous one, we emit:
21*700637cbSDimitry Andric //
22*700637cbSDimitry Andric // - the frames in the given call stack in the root-to-leaf order
23*700637cbSDimitry Andric //
24*700637cbSDimitry Andric // - the length of the given call stack
25*700637cbSDimitry Andric //
26*700637cbSDimitry Andric // If a given call stack has a non-empty common prefix relative to the previous
27*700637cbSDimitry Andric // one, we emit:
28*700637cbSDimitry Andric //
29*700637cbSDimitry Andric // - the relative location of the common prefix, encoded as a negative number.
30*700637cbSDimitry Andric //
31*700637cbSDimitry Andric // - a portion of the given call stack that's beyond the common prefix
32*700637cbSDimitry Andric //
33*700637cbSDimitry Andric // - the length of the given call stack, including the length of the common
34*700637cbSDimitry Andric // prefix.
35*700637cbSDimitry Andric //
36*700637cbSDimitry Andric // The resulting RadixArray requires a somewhat unintuitive backward traversal
37*700637cbSDimitry Andric // to reconstruct a call stack -- read the call stack length and scan backward
38*700637cbSDimitry Andric // while collecting frames in the leaf to root order. build, the caller of this
39*700637cbSDimitry Andric // function, reverses RadixArray in place so that we can reconstruct a call
40*700637cbSDimitry Andric // stack as if we were deserializing an array in a typical way -- the call stack
41*700637cbSDimitry Andric // length followed by the frames in the leaf-to-root order except that we need
42*700637cbSDimitry Andric // to handle pointers to parents along the way.
43*700637cbSDimitry Andric //
44*700637cbSDimitry Andric // To quickly determine the location of the common prefix within RadixArray,
45*700637cbSDimitry Andric // Indexes caches the indexes of the previous call stack's frames within
46*700637cbSDimitry Andric // RadixArray.
47*700637cbSDimitry Andric template <typename FrameIdTy>
encodeCallStack(const llvm::SmallVector<FrameIdTy> * CallStack,const llvm::SmallVector<FrameIdTy> * Prev,const llvm::DenseMap<FrameIdTy,LinearFrameId> * MemProfFrameIndexes)48*700637cbSDimitry Andric LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
49*700637cbSDimitry Andric const llvm::SmallVector<FrameIdTy> *CallStack,
50*700637cbSDimitry Andric const llvm::SmallVector<FrameIdTy> *Prev,
51*700637cbSDimitry Andric const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
52*700637cbSDimitry Andric // Compute the length of the common root prefix between Prev and CallStack.
53*700637cbSDimitry Andric uint32_t CommonLen = 0;
54*700637cbSDimitry Andric if (Prev) {
55*700637cbSDimitry Andric auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
56*700637cbSDimitry Andric CallStack->rend());
57*700637cbSDimitry Andric CommonLen = std::distance(CallStack->rbegin(), Pos.second);
58*700637cbSDimitry Andric }
59*700637cbSDimitry Andric
60*700637cbSDimitry Andric // Drop the portion beyond CommonLen.
61*700637cbSDimitry Andric assert(CommonLen <= Indexes.size());
62*700637cbSDimitry Andric Indexes.resize(CommonLen);
63*700637cbSDimitry Andric
64*700637cbSDimitry Andric // Append a pointer to the parent.
65*700637cbSDimitry Andric if (CommonLen) {
66*700637cbSDimitry Andric uint32_t CurrentIndex = RadixArray.size();
67*700637cbSDimitry Andric uint32_t ParentIndex = Indexes.back();
68*700637cbSDimitry Andric // The offset to the parent must be negative because we are pointing to an
69*700637cbSDimitry Andric // element we've already added to RadixArray.
70*700637cbSDimitry Andric assert(ParentIndex < CurrentIndex);
71*700637cbSDimitry Andric RadixArray.push_back(ParentIndex - CurrentIndex);
72*700637cbSDimitry Andric }
73*700637cbSDimitry Andric
74*700637cbSDimitry Andric // Copy the part of the call stack beyond the common prefix to RadixArray.
75*700637cbSDimitry Andric assert(CommonLen <= CallStack->size());
76*700637cbSDimitry Andric for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
77*700637cbSDimitry Andric // Remember the index of F in RadixArray.
78*700637cbSDimitry Andric Indexes.push_back(RadixArray.size());
79*700637cbSDimitry Andric RadixArray.push_back(
80*700637cbSDimitry Andric MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
81*700637cbSDimitry Andric }
82*700637cbSDimitry Andric assert(CallStack->size() == Indexes.size());
83*700637cbSDimitry Andric
84*700637cbSDimitry Andric // End with the call stack length.
85*700637cbSDimitry Andric RadixArray.push_back(CallStack->size());
86*700637cbSDimitry Andric
87*700637cbSDimitry Andric // Return the index within RadixArray where we can start reconstructing a
88*700637cbSDimitry Andric // given call stack from.
89*700637cbSDimitry Andric return RadixArray.size() - 1;
90*700637cbSDimitry Andric }
91*700637cbSDimitry Andric
92*700637cbSDimitry Andric template <typename FrameIdTy>
build(llvm::MapVector<CallStackId,llvm::SmallVector<FrameIdTy>> && MemProfCallStackData,const llvm::DenseMap<FrameIdTy,LinearFrameId> * MemProfFrameIndexes,llvm::DenseMap<FrameIdTy,FrameStat> & FrameHistogram)93*700637cbSDimitry Andric void CallStackRadixTreeBuilder<FrameIdTy>::build(
94*700637cbSDimitry Andric llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
95*700637cbSDimitry Andric &&MemProfCallStackData,
96*700637cbSDimitry Andric const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
97*700637cbSDimitry Andric llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
98*700637cbSDimitry Andric // Take the vector portion of MemProfCallStackData. The vector is exactly
99*700637cbSDimitry Andric // what we need to sort. Also, we no longer need its lookup capability.
100*700637cbSDimitry Andric llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
101*700637cbSDimitry Andric
102*700637cbSDimitry Andric // Return early if we have no work to do.
103*700637cbSDimitry Andric if (CallStacks.empty()) {
104*700637cbSDimitry Andric RadixArray.clear();
105*700637cbSDimitry Andric CallStackPos.clear();
106*700637cbSDimitry Andric return;
107*700637cbSDimitry Andric }
108*700637cbSDimitry Andric
109*700637cbSDimitry Andric // Sorting the list of call stacks in the dictionary order is sufficient to
110*700637cbSDimitry Andric // maximize the length of the common prefix between two adjacent call stacks
111*700637cbSDimitry Andric // and thus minimize the length of RadixArray. However, we go one step
112*700637cbSDimitry Andric // further and try to reduce the number of times we follow pointers to parents
113*700637cbSDimitry Andric // during deserilization. Consider a poorly encoded radix tree:
114*700637cbSDimitry Andric //
115*700637cbSDimitry Andric // CallStackId 1: f1 -> f2 -> f3
116*700637cbSDimitry Andric // |
117*700637cbSDimitry Andric // CallStackId 2: +--- f4 -> f5
118*700637cbSDimitry Andric // |
119*700637cbSDimitry Andric // CallStackId 3: +--> f6
120*700637cbSDimitry Andric //
121*700637cbSDimitry Andric // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
122*700637cbSDimitry Andric // Once we encode CallStackId 1 into RadixArray, every other call stack with
123*700637cbSDimitry Andric // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
124*700637cbSDimitry Andric // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
125*700637cbSDimitry Andric // parents twice.
126*700637cbSDimitry Andric //
127*700637cbSDimitry Andric // We try to alleviate the situation by sorting the list of call stacks by
128*700637cbSDimitry Andric // comparing the popularity of frames rather than the integer values of
129*700637cbSDimitry Andric // FrameIds. In the example above, f4 is more popular than f2, so we sort the
130*700637cbSDimitry Andric // call stacks and encode them as:
131*700637cbSDimitry Andric //
132*700637cbSDimitry Andric // CallStackId 2: f1 -- f4 -> f5
133*700637cbSDimitry Andric // | |
134*700637cbSDimitry Andric // CallStackId 3: | +--> f6
135*700637cbSDimitry Andric // |
136*700637cbSDimitry Andric // CallStackId 1: +--> f2 -> f3
137*700637cbSDimitry Andric //
138*700637cbSDimitry Andric // Notice that CallStackId 3 follows a pointer to a parent only once.
139*700637cbSDimitry Andric //
140*700637cbSDimitry Andric // All this is a quick-n-dirty trick to reduce the number of jumps. The
141*700637cbSDimitry Andric // proper way would be to compute the weight of each radix tree node -- how
142*700637cbSDimitry Andric // many call stacks use a given radix tree node, and encode a radix tree from
143*700637cbSDimitry Andric // the heaviest node first. We do not do so because that's a lot of work.
144*700637cbSDimitry Andric llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
145*700637cbSDimitry Andric // Call stacks are stored from leaf to root. Perform comparisons from the
146*700637cbSDimitry Andric // root.
147*700637cbSDimitry Andric return std::lexicographical_compare(
148*700637cbSDimitry Andric L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
149*700637cbSDimitry Andric [&](FrameIdTy F1, FrameIdTy F2) {
150*700637cbSDimitry Andric uint64_t H1 = FrameHistogram[F1].Count;
151*700637cbSDimitry Andric uint64_t H2 = FrameHistogram[F2].Count;
152*700637cbSDimitry Andric // Popular frames should come later because we encode call stacks from
153*700637cbSDimitry Andric // the last one in the list.
154*700637cbSDimitry Andric if (H1 != H2)
155*700637cbSDimitry Andric return H1 < H2;
156*700637cbSDimitry Andric // For sort stability.
157*700637cbSDimitry Andric return F1 < F2;
158*700637cbSDimitry Andric });
159*700637cbSDimitry Andric });
160*700637cbSDimitry Andric
161*700637cbSDimitry Andric // Reserve some reasonable amount of storage.
162*700637cbSDimitry Andric RadixArray.clear();
163*700637cbSDimitry Andric RadixArray.reserve(CallStacks.size() * 8);
164*700637cbSDimitry Andric
165*700637cbSDimitry Andric // Indexes will grow as long as the longest call stack.
166*700637cbSDimitry Andric Indexes.clear();
167*700637cbSDimitry Andric Indexes.reserve(512);
168*700637cbSDimitry Andric
169*700637cbSDimitry Andric // CallStackPos will grow to exactly CallStacks.size() entries.
170*700637cbSDimitry Andric CallStackPos.clear();
171*700637cbSDimitry Andric CallStackPos.reserve(CallStacks.size());
172*700637cbSDimitry Andric
173*700637cbSDimitry Andric // Compute the radix array. We encode one call stack at a time, computing the
174*700637cbSDimitry Andric // longest prefix that's shared with the previous call stack we encode. For
175*700637cbSDimitry Andric // each call stack we encode, we remember a mapping from CallStackId to its
176*700637cbSDimitry Andric // position within RadixArray.
177*700637cbSDimitry Andric //
178*700637cbSDimitry Andric // As an optimization, we encode from the last call stack in CallStacks to
179*700637cbSDimitry Andric // reduce the number of times we follow pointers to the parents. Consider the
180*700637cbSDimitry Andric // list of call stacks that has been sorted in the dictionary order:
181*700637cbSDimitry Andric //
182*700637cbSDimitry Andric // Call Stack 1: F1
183*700637cbSDimitry Andric // Call Stack 2: F1 -> F2
184*700637cbSDimitry Andric // Call Stack 3: F1 -> F2 -> F3
185*700637cbSDimitry Andric //
186*700637cbSDimitry Andric // If we traversed CallStacks in the forward order, we would end up with a
187*700637cbSDimitry Andric // radix tree like:
188*700637cbSDimitry Andric //
189*700637cbSDimitry Andric // Call Stack 1: F1
190*700637cbSDimitry Andric // |
191*700637cbSDimitry Andric // Call Stack 2: +---> F2
192*700637cbSDimitry Andric // |
193*700637cbSDimitry Andric // Call Stack 3: +---> F3
194*700637cbSDimitry Andric //
195*700637cbSDimitry Andric // Notice that each call stack jumps to the previous one. However, if we
196*700637cbSDimitry Andric // traverse CallStacks in the reverse order, then Call Stack 3 has the
197*700637cbSDimitry Andric // complete call stack encoded without any pointers. Call Stack 1 and 2 point
198*700637cbSDimitry Andric // to appropriate prefixes of Call Stack 3.
199*700637cbSDimitry Andric const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
200*700637cbSDimitry Andric for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
201*700637cbSDimitry Andric LinearCallStackId Pos =
202*700637cbSDimitry Andric encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
203*700637cbSDimitry Andric CallStackPos.insert({CSId, Pos});
204*700637cbSDimitry Andric Prev = &CallStack;
205*700637cbSDimitry Andric }
206*700637cbSDimitry Andric
207*700637cbSDimitry Andric // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
208*700637cbSDimitry Andric assert(!RadixArray.empty());
209*700637cbSDimitry Andric
210*700637cbSDimitry Andric // Reverse the radix array in place. We do so mostly for intuitive
211*700637cbSDimitry Andric // deserialization where we would read the length field and then the call
212*700637cbSDimitry Andric // stack frames proper just like any other array deserialization, except
213*700637cbSDimitry Andric // that we have occasional jumps to take advantage of prefixes.
214*700637cbSDimitry Andric for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
215*700637cbSDimitry Andric std::swap(RadixArray[I], RadixArray[J]);
216*700637cbSDimitry Andric
217*700637cbSDimitry Andric // "Reverse" the indexes stored in CallStackPos.
218*700637cbSDimitry Andric for (auto &[K, V] : CallStackPos)
219*700637cbSDimitry Andric V = RadixArray.size() - 1 - V;
220*700637cbSDimitry Andric }
221*700637cbSDimitry Andric
222*700637cbSDimitry Andric // Explicitly instantiate class with the utilized FrameIdTy.
223*700637cbSDimitry Andric template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>;
224*700637cbSDimitry Andric template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>;
225*700637cbSDimitry Andric
226*700637cbSDimitry Andric template <typename FrameIdTy>
227*700637cbSDimitry Andric llvm::DenseMap<FrameIdTy, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId,llvm::SmallVector<FrameIdTy>> & MemProfCallStackData)228*700637cbSDimitry Andric computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
229*700637cbSDimitry Andric &MemProfCallStackData) {
230*700637cbSDimitry Andric llvm::DenseMap<FrameIdTy, FrameStat> Histogram;
231*700637cbSDimitry Andric
232*700637cbSDimitry Andric for (const auto &KV : MemProfCallStackData) {
233*700637cbSDimitry Andric const auto &CS = KV.second;
234*700637cbSDimitry Andric for (unsigned I = 0, E = CS.size(); I != E; ++I) {
235*700637cbSDimitry Andric auto &S = Histogram[CS[I]];
236*700637cbSDimitry Andric ++S.Count;
237*700637cbSDimitry Andric S.PositionSum += I;
238*700637cbSDimitry Andric }
239*700637cbSDimitry Andric }
240*700637cbSDimitry Andric return Histogram;
241*700637cbSDimitry Andric }
242*700637cbSDimitry Andric
243*700637cbSDimitry Andric // Explicitly instantiate function with the utilized FrameIdTy.
244*700637cbSDimitry Andric template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>
245*700637cbSDimitry Andric computeFrameHistogram<FrameId>(
246*700637cbSDimitry Andric llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
247*700637cbSDimitry Andric &MemProfCallStackData);
248*700637cbSDimitry Andric template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>
249*700637cbSDimitry Andric computeFrameHistogram<LinearFrameId>(
250*700637cbSDimitry Andric llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
251*700637cbSDimitry Andric &MemProfCallStackData);
252*700637cbSDimitry Andric } // namespace memprof
253*700637cbSDimitry Andric } // namespace llvm
254