1 //===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // This file contains logic that implements a space efficient radix tree 9 // encoding for callstacks used by MemProf. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/ProfileData/MemProfRadixTree.h" 14 15 namespace llvm { 16 namespace memprof { 17 // Encode a call stack into RadixArray. Return the starting index within 18 // RadixArray. For each call stack we encode, we emit two or three components 19 // into RadixArray. If a given call stack doesn't have a common prefix relative 20 // to the previous one, we emit: 21 // 22 // - the frames in the given call stack in the root-to-leaf order 23 // 24 // - the length of the given call stack 25 // 26 // If a given call stack has a non-empty common prefix relative to the previous 27 // one, we emit: 28 // 29 // - the relative location of the common prefix, encoded as a negative number. 30 // 31 // - a portion of the given call stack that's beyond the common prefix 32 // 33 // - the length of the given call stack, including the length of the common 34 // prefix. 35 // 36 // The resulting RadixArray requires a somewhat unintuitive backward traversal 37 // to reconstruct a call stack -- read the call stack length and scan backward 38 // while collecting frames in the leaf to root order. build, the caller of this 39 // function, reverses RadixArray in place so that we can reconstruct a call 40 // stack as if we were deserializing an array in a typical way -- the call stack 41 // length followed by the frames in the leaf-to-root order except that we need 42 // to handle pointers to parents along the way. 43 // 44 // To quickly determine the location of the common prefix within RadixArray, 45 // Indexes caches the indexes of the previous call stack's frames within 46 // RadixArray. 47 template <typename FrameIdTy> 48 LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack( 49 const llvm::SmallVector<FrameIdTy> *CallStack, 50 const llvm::SmallVector<FrameIdTy> *Prev, 51 const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) { 52 // Compute the length of the common root prefix between Prev and CallStack. 53 uint32_t CommonLen = 0; 54 if (Prev) { 55 auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(), 56 CallStack->rend()); 57 CommonLen = std::distance(CallStack->rbegin(), Pos.second); 58 } 59 60 // Drop the portion beyond CommonLen. 61 assert(CommonLen <= Indexes.size()); 62 Indexes.resize(CommonLen); 63 64 // Append a pointer to the parent. 65 if (CommonLen) { 66 uint32_t CurrentIndex = RadixArray.size(); 67 uint32_t ParentIndex = Indexes.back(); 68 // The offset to the parent must be negative because we are pointing to an 69 // element we've already added to RadixArray. 70 assert(ParentIndex < CurrentIndex); 71 RadixArray.push_back(ParentIndex - CurrentIndex); 72 } 73 74 // Copy the part of the call stack beyond the common prefix to RadixArray. 75 assert(CommonLen <= CallStack->size()); 76 for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) { 77 // Remember the index of F in RadixArray. 78 Indexes.push_back(RadixArray.size()); 79 RadixArray.push_back( 80 MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F); 81 } 82 assert(CallStack->size() == Indexes.size()); 83 84 // End with the call stack length. 85 RadixArray.push_back(CallStack->size()); 86 87 // Return the index within RadixArray where we can start reconstructing a 88 // given call stack from. 89 return RadixArray.size() - 1; 90 } 91 92 template <typename FrameIdTy> 93 void CallStackRadixTreeBuilder<FrameIdTy>::build( 94 llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> 95 &&MemProfCallStackData, 96 const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, 97 llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) { 98 // Take the vector portion of MemProfCallStackData. The vector is exactly 99 // what we need to sort. Also, we no longer need its lookup capability. 100 llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector(); 101 102 // Return early if we have no work to do. 103 if (CallStacks.empty()) { 104 RadixArray.clear(); 105 CallStackPos.clear(); 106 return; 107 } 108 109 // Sorting the list of call stacks in the dictionary order is sufficient to 110 // maximize the length of the common prefix between two adjacent call stacks 111 // and thus minimize the length of RadixArray. However, we go one step 112 // further and try to reduce the number of times we follow pointers to parents 113 // during deserilization. Consider a poorly encoded radix tree: 114 // 115 // CallStackId 1: f1 -> f2 -> f3 116 // | 117 // CallStackId 2: +--- f4 -> f5 118 // | 119 // CallStackId 3: +--> f6 120 // 121 // Here, f2 and f4 appear once and twice, respectively, in the call stacks. 122 // Once we encode CallStackId 1 into RadixArray, every other call stack with 123 // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 124 // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to 125 // parents twice. 126 // 127 // We try to alleviate the situation by sorting the list of call stacks by 128 // comparing the popularity of frames rather than the integer values of 129 // FrameIds. In the example above, f4 is more popular than f2, so we sort the 130 // call stacks and encode them as: 131 // 132 // CallStackId 2: f1 -- f4 -> f5 133 // | | 134 // CallStackId 3: | +--> f6 135 // | 136 // CallStackId 1: +--> f2 -> f3 137 // 138 // Notice that CallStackId 3 follows a pointer to a parent only once. 139 // 140 // All this is a quick-n-dirty trick to reduce the number of jumps. The 141 // proper way would be to compute the weight of each radix tree node -- how 142 // many call stacks use a given radix tree node, and encode a radix tree from 143 // the heaviest node first. We do not do so because that's a lot of work. 144 llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { 145 // Call stacks are stored from leaf to root. Perform comparisons from the 146 // root. 147 return std::lexicographical_compare( 148 L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), 149 [&](FrameIdTy F1, FrameIdTy F2) { 150 uint64_t H1 = FrameHistogram[F1].Count; 151 uint64_t H2 = FrameHistogram[F2].Count; 152 // Popular frames should come later because we encode call stacks from 153 // the last one in the list. 154 if (H1 != H2) 155 return H1 < H2; 156 // For sort stability. 157 return F1 < F2; 158 }); 159 }); 160 161 // Reserve some reasonable amount of storage. 162 RadixArray.clear(); 163 RadixArray.reserve(CallStacks.size() * 8); 164 165 // Indexes will grow as long as the longest call stack. 166 Indexes.clear(); 167 Indexes.reserve(512); 168 169 // CallStackPos will grow to exactly CallStacks.size() entries. 170 CallStackPos.clear(); 171 CallStackPos.reserve(CallStacks.size()); 172 173 // Compute the radix array. We encode one call stack at a time, computing the 174 // longest prefix that's shared with the previous call stack we encode. For 175 // each call stack we encode, we remember a mapping from CallStackId to its 176 // position within RadixArray. 177 // 178 // As an optimization, we encode from the last call stack in CallStacks to 179 // reduce the number of times we follow pointers to the parents. Consider the 180 // list of call stacks that has been sorted in the dictionary order: 181 // 182 // Call Stack 1: F1 183 // Call Stack 2: F1 -> F2 184 // Call Stack 3: F1 -> F2 -> F3 185 // 186 // If we traversed CallStacks in the forward order, we would end up with a 187 // radix tree like: 188 // 189 // Call Stack 1: F1 190 // | 191 // Call Stack 2: +---> F2 192 // | 193 // Call Stack 3: +---> F3 194 // 195 // Notice that each call stack jumps to the previous one. However, if we 196 // traverse CallStacks in the reverse order, then Call Stack 3 has the 197 // complete call stack encoded without any pointers. Call Stack 1 and 2 point 198 // to appropriate prefixes of Call Stack 3. 199 const llvm::SmallVector<FrameIdTy> *Prev = nullptr; 200 for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) { 201 LinearCallStackId Pos = 202 encodeCallStack(&CallStack, Prev, MemProfFrameIndexes); 203 CallStackPos.insert({CSId, Pos}); 204 Prev = &CallStack; 205 } 206 207 // "RadixArray.size() - 1" below is problematic if RadixArray is empty. 208 assert(!RadixArray.empty()); 209 210 // Reverse the radix array in place. We do so mostly for intuitive 211 // deserialization where we would read the length field and then the call 212 // stack frames proper just like any other array deserialization, except 213 // that we have occasional jumps to take advantage of prefixes. 214 for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J) 215 std::swap(RadixArray[I], RadixArray[J]); 216 217 // "Reverse" the indexes stored in CallStackPos. 218 for (auto &[K, V] : CallStackPos) 219 V = RadixArray.size() - 1 - V; 220 } 221 222 // Explicitly instantiate class with the utilized FrameIdTy. 223 template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>; 224 template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>; 225 226 template <typename FrameIdTy> 227 llvm::DenseMap<FrameIdTy, FrameStat> 228 computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> 229 &MemProfCallStackData) { 230 llvm::DenseMap<FrameIdTy, FrameStat> Histogram; 231 232 for (const auto &KV : MemProfCallStackData) { 233 const auto &CS = KV.second; 234 for (unsigned I = 0, E = CS.size(); I != E; ++I) { 235 auto &S = Histogram[CS[I]]; 236 ++S.Count; 237 S.PositionSum += I; 238 } 239 } 240 return Histogram; 241 } 242 243 // Explicitly instantiate function with the utilized FrameIdTy. 244 template LLVM_ABI llvm::DenseMap<FrameId, FrameStat> 245 computeFrameHistogram<FrameId>( 246 llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> 247 &MemProfCallStackData); 248 template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat> 249 computeFrameHistogram<LinearFrameId>( 250 llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> 251 &MemProfCallStackData); 252 } // namespace memprof 253 } // namespace llvm 254