#include "llvm/ProfileData/MemProf.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/HashBuilder.h"

namespace llvm {
namespace memprof {
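// Build the complete MemProf schema by expanding every MIB field listed in
// llvm/ProfileData/MIBEntryDef.inc.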
MemProfSchema getFullSchema() {
  MemProfSchema List;
#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
  return List;
}

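// A reduced schema carrying only the fields consulted when classifying
// allocation contexts as hot or cold.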
MemProfSchema getHotColdSchema() {
  return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime,
          Meta::TotalLifetimeAccessDensity};
}

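// Per-version layout of an IndexedAllocationInfo: V0/V1 store the call stack
// inline as a length-prefixed list of FrameIds, V2 replaces it with a single
// CallStackId hash, and V3 stores a LinearCallStackId index into the radix
// tree array. In every version the schema-driven payload follows.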
static size_t serializedSizeV0(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The number of frames to serialize.
  Size += sizeof(uint64_t);
  // The callstack frame ids.
  Size += sizeof(FrameId) * IAI.CallStack.size();
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV2(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The CallStackId.
  Size += sizeof(CallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV3(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The linear call stack ID.
  Size += sizeof(LinearCallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
                                             IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static size_t serializedSizeV0(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version0);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  for (const auto &Frames : Record.CallSites) {
    // The number of frame ids to serialize.
    Result += sizeof(uint64_t);
    Result += Frames.size() * sizeof(FrameId);
  }
  return Result;
}

static size_t serializedSizeV2(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version2);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The CallStackId.
  Result += Record.CallSiteIds.size() * sizeof(CallStackId);
  return Result;
}

static size_t serializedSizeV3(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version3);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The linear call stack ID.
  Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId);
  return Result;
}

size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
                                            IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static void serializeV0(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<uint64_t>(N.CallStack.size());
    for (const FrameId &Id : N.CallStack)
      LE.write<FrameId>(Id);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSites.size());
  for (const auto &Frames : Record.CallSites) {
    LE.write<uint64_t>(Frames.size());
    for (const FrameId &Id : Frames)
      LE.write<FrameId>(Id);
  }
}

static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}

static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version0:
  case Version1:
    serializeV0(*this, Schema, OS);
    return;
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    for (uint64_t J = 0; J < NumFrames; J++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Node.CallStack.push_back(Id);
    }
    Node.CSId = hashCallStack(Node.CallStack);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    llvm::SmallVector<FrameId> Frames;
    Frames.reserve(NumFrames);
    for (uint64_t K = 0; K < NumFrames; K++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Frames.push_back(Id);
    }
    Record.CallSites.push_back(Frames);
    Record.CallSiteIds.push_back(hashCallStack(Frames));
  }

  return Record;
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId. Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version0:
  case Version1:
    return deserializeV0(Schema, Ptr);
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}

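// Materialize a MemProfRecord from this indexed record by resolving each
// CallStackId through Callback. Illustrative usage (CSIdToFrames is a
// hypothetical caller-maintained table from CallStackId to frames):
//
//   MemProfRecord R = IndexedRecord.toMemProfRecord(
//       [&](CallStackId CSId) { return CSIdToFrames.lookup(CSId); });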
MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.". Note
  // that we do not drop any ".__uniq." suffixes, as getCanonicalFnName does
  // not drop those by default. This is by design to differentiate internal
  // linkage functions during matching. By dropping the other suffixes we can
  // match functions in the profile use phase before those suffixes have been
  // added. Note that this applies to both instrumented and sampled function
  // names.
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function GUID, which we expect to be a uint64_t. At this time,
  // it is the lower 64 bits of the MD5 of the canonical function name.
  return Function::getGUID(CanonicalName);
}

Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}

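// Hash a call stack (a sequence of FrameIds) into a 64-bit CallStackId using
// BLAKE3 truncated to 8 bytes.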
CallStackId hashCallStack(ArrayRef<FrameId> CS) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray. Return the starting index within
// RadixArray. For each call stack we encode, we emit two or three components
// into RadixArray. If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number.
//
// - a portion of the given call stack that's beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix.
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf-to-root order. build, the caller of this
// function, reverses RadixArray in place so that we can reconstruct a call
// stack as if we were deserializing an array in a typical way -- the call
// stack length followed by the frames in the leaf-to-root order, except that
// we need to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
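//
// For example (illustrative, with frames shown by name rather than by their
// linear indexes): encoding the call stack "F1 F2 F4" with no previous call
// stack appends [F1, F2, F4, 3] and returns position 3; encoding "F1 F2 F3"
// next shares the prefix "F1 F2", so we append [-3, F3, 3] (the -3 points back
// from index 4 to the shared F2 at index 1) and return position 6. build then
// reverses the whole array and adjusts the returned positions accordingly.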
LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
    const llvm::SmallVector<FrameId> *CallStack,
    const llvm::SmallVector<FrameId> *Prev,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

void CallStackRadixTreeBuilder::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData. The vector is exactly
  // what we need to sort. Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray. However, we go one step
  // further and try to reduce the number of times we follow pointers to
  // parents during deserialization. Consider a poorly encoded radix tree:
  //
  // CallStackId 1:  f1 -> f2 -> f3
  //                  |
  // CallStackId 2:   +--- f4 -> f5
  //                        |
  // CallStackId 3:         +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds. In the example above, f4 is more popular than f2, so we sort
  // the call stacks and encode them as:
  //
  // CallStackId 2:  f1 -- f4 -> f5
  //                  |     |
  // CallStackId 3:   |     +--> f6
  //                  |
  // CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps. The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node -- and encode a radix tree
  // from the heaviest node first. We do not do so because that's a lot of
  // work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root. Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameId F1, FrameId F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks
          // from the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow as long as the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array. We encode one call stack at a time, computing
  // the longest prefix that's shared with the previous call stack we encode.
  // For each call stack we encode, we remember a mapping from CallStackId to
  // its position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents. Consider
  // the list of call stacks that has been sorted in the dictionary order:
  //
  // Call Stack 1: F1
  // Call Stack 2: F1 -> F2
  // Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  // Call Stack 1:  F1
  //                 |
  // Call Stack 2:   +---> F2
  //                        |
  // Call Stack 3:          +---> F3
  //
  // Notice that each call stack jumps to the previous one. However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers. Call Stack 1 and 2
  // point to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameId> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place. We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}

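// Count, for each frame, how many call stacks it appears in and the sum of
// its positions within them. CallStackRadixTreeBuilder::build uses the
// resulting counts to order call stacks by frame popularity.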
llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameId, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}

void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
  for (const auto &AS : Record.AllocSites) {
    assert(AS.CSId == hashCallStack(AS.CallStack));
    (void)AS;
  }
}

void verifyFunctionProfileData(
    const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord>
        &FunctionProfileData) {
  for (const auto &[GUID, Record] : FunctionProfileData) {
    (void)GUID;
    verifyIndexedMemProfRecord(Record);
  }
}

} // namespace memprof
} // namespace llvm