#include "llvm/ProfileData/MemProf.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/HashBuilder.h"

namespace llvm {
namespace memprof {
MemProfSchema getFullSchema() {
  MemProfSchema List;
#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
  return List;
}

MemProfSchema getHotColdSchema() {
  return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime,
          Meta::TotalLifetimeAccessDensity};
}

static size_t serializedSizeV0(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The number of frames to serialize.
  Size += sizeof(uint64_t);
  // The callstack frame ids.
  Size += sizeof(FrameId) * IAI.CallStack.size();
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV2(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The CallStackId
  Size += sizeof(CallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV3(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The linear call stack ID.
  Size += sizeof(LinearCallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
                                             IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static size_t serializedSizeV0(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version0);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  for (const auto &Frames : Record.CallSites) {
    // The number of frame ids to serialize.
    Result += sizeof(uint64_t);
    Result += Frames.size() * sizeof(FrameId);
  }
  return Result;
}

static size_t serializedSizeV2(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version2);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The CallStackId
  Result += Record.CallSiteIds.size() * sizeof(CallStackId);
  return Result;
}

static size_t serializedSizeV3(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version3);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The linear call stack ID.
  Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId);
  return Result;
}

size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
                                            IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static void serializeV0(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<uint64_t>(N.CallStack.size());
    for (const FrameId &Id : N.CallStack)
      LE.write<FrameId>(Id);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSites.size());
  for (const auto &Frames : Record.CallSites) {
    LE.write<uint64_t>(Frames.size());
    for (const FrameId &Id : Frames)
      LE.write<FrameId>(Id);
  }
}

static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}

static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version0:
  case Version1:
    serializeV0(*this, Schema, OS);
    return;
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    for (uint64_t J = 0; J < NumFrames; J++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Node.CallStack.push_back(Id);
    }
    Node.CSId = hashCallStack(Node.CallStack);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    llvm::SmallVector<FrameId> Frames;
    Frames.reserve(NumFrames);
    for (uint64_t K = 0; K < NumFrames; K++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Frames.push_back(Id);
    }
    Record.CallSites.push_back(Frames);
    Record.CallSiteIds.push_back(hashCallStack(Frames));
  }

  return Record;
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId.  Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version0:
  case Version1:
    return deserializeV0(Schema, Ptr);
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}

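// Convert this IndexedMemProfRecord to a MemProfRecord by expanding each
// CallStackId into its sequence of Frames via the supplied Callback.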
MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.". Note
  // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop
  // those by default. This is by design to differentiate internal linkage
  // functions during matching. By dropping the other suffixes we can then match
  // functions in the profile use phase prior to their addition. Note that this
  // applies to both instrumented and sampled function names.
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function guid which we expect to be a uint64_t. At
  // this time, it is the lower 64 bits of the md5 of the canonical
  // function name.
  return Function::getGUID(CanonicalName);
}

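// Read a MemProfSchema from Buffer: a count followed by that many Meta tags.
// On success, advance Buffer to just past the schema; on a malformed schema,
// return an error and leave Buffer untouched.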
Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}

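// Compute a CallStackId for CS by feeding its FrameIds into a BLAKE3 hash
// truncated to 8 bytes, the width of CallStackId.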
CallStackId hashCallStack(ArrayRef<FrameId> CS) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray.  Return the starting index within
// RadixArray.  For each call stack we encode, we emit two or three components
// into RadixArray.  If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number.
//
// - a portion of the given call stack that's beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix.
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf to root order.  build, the caller of this
// function, reverses RadixArray in place so that we can reconstruct a call
// stack as if we were deserializing an array in a typical way -- the call stack
// length followed by the frames in the leaf-to-root order except that we need
// to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
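//
// For illustration only (a sketch with made-up linear frame IDs, assuming
// MemProfFrameIndexes maps f1->1, f2->2, f3->3, f4->4), suppose we encode the
// call stack f1 -> f2 -> f3 (root to leaf) and then f1 -> f2 -> f4:
//
//   After the first call stack:   RadixArray = [1 2 3 3]         returns 3
//   After the second call stack:  RadixArray = [1 2 3 3 -3 4 3]  returns 6
//
// The second call stack shares the prefix "f1 f2", so it emits the offset -3
// back to the entry for f2, the one frame beyond the prefix (f4), and its
// total length 3.  Once build reverses RadixArray to [3 4 -3 3 3 2 1], the
// second call stack starts at index 0: read length 3 and frame 4 (f4), then
// follow the offset to pick up frames 2 (f2) and 1 (f1).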
LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
    const llvm::SmallVector<FrameId> *CallStack,
    const llvm::SmallVector<FrameId> *Prev,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

void CallStackRadixTreeBuilder::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData.  The vector is exactly
  // what we need to sort.  Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray.  However, we go one step
  // further and try to reduce the number of times we follow pointers to parents
  // during deserialization.  Consider a poorly encoded radix tree:
  //
  // CallStackId 1:  f1 -> f2 -> f3
  //                  |
  // CallStackId 2:   +--- f4 -> f5
  //                        |
  // CallStackId 3:         +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1.  Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds.  In the example above, f4 is more popular than f2, so we sort the
  // call stacks and encode them as:
  //
  // CallStackId 2:  f1 -- f4 -> f5
  //                  |     |
  // CallStackId 3:   |     +--> f6
  //                  |
  // CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps.  The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node, and encode a radix tree from
  // the heaviest node first.  We do not do so because that's a lot of work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root.  Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameId F1, FrameId F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks from
          // the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow as long as the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array.  We encode one call stack at a time, computing the
  // longest prefix that's shared with the previous call stack we encode.  For
  // each call stack we encode, we remember a mapping from CallStackId to its
  // position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents.  Consider the
  // list of call stacks that has been sorted in the dictionary order:
  //
  // Call Stack 1: F1
  // Call Stack 2: F1 -> F2
  // Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  // Call Stack 1:  F1
  //                |
  // Call Stack 2:  +---> F2
  //                      |
  // Call Stack 3:        +---> F3
  //
  // Notice that each call stack jumps to the previous one.  However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers.  Call Stack 1 and 2 point
  // to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameId> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place.  We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}

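// Count how many times each frame occurs across all call stacks (Count) and
// the sum of its zero-based positions within them (PositionSum), where
// position 0 is the leaf end of each call stack.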
llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameId, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}

void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
  for (const auto &AS : Record.AllocSites) {
    assert(AS.CSId == hashCallStack(AS.CallStack));
    (void)AS;
  }
}

void verifyFunctionProfileData(
    const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord>
        &FunctionProfileData) {
  for (const auto &[GUID, Record] : FunctionProfileData) {
    (void)GUID;
    verifyIndexedMemProfRecord(Record);
  }
}

} // namespace memprof
} // namespace llvm