xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp (revision bc5304a006238115291e7568583632889dffbab9)
1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// The goal of hot/cold splitting is to improve the memory locality of code.
11 /// The splitting pass does this by identifying cold blocks and moving them into
12 /// separate functions.
13 ///
14 /// When the splitting pass finds a cold block (referred to as "the sink"), it
15 /// grows a maximal cold region around that block. The maximal region contains
16 /// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17 /// cold as the sink. Once a region is found, it's split out of the original
18 /// function provided it's profitable to do so.
19 ///
20 /// [*] In practice, there is some added complexity because some blocks are not
21 /// safe to extract.
22 ///
23 /// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24 /// TODO: Reorder outlined functions.
25 ///
26 //===----------------------------------------------------------------------===//
27 
28 #include "llvm/Transforms/IPO/HotColdSplitting.h"
29 #include "llvm/ADT/PostOrderIterator.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/BlockFrequencyInfo.h"
33 #include "llvm/Analysis/BranchProbabilityInfo.h"
34 #include "llvm/Analysis/CFG.h"
35 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
36 #include "llvm/Analysis/PostDominators.h"
37 #include "llvm/Analysis/ProfileSummaryInfo.h"
38 #include "llvm/Analysis/TargetTransformInfo.h"
39 #include "llvm/IR/BasicBlock.h"
40 #include "llvm/IR/CFG.h"
41 #include "llvm/IR/DataLayout.h"
42 #include "llvm/IR/DiagnosticInfo.h"
43 #include "llvm/IR/Dominators.h"
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/Instruction.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/IR/Metadata.h"
49 #include "llvm/IR/Module.h"
50 #include "llvm/IR/PassManager.h"
51 #include "llvm/IR/Type.h"
52 #include "llvm/IR/Use.h"
53 #include "llvm/IR/User.h"
54 #include "llvm/IR/Value.h"
55 #include "llvm/InitializePasses.h"
56 #include "llvm/Pass.h"
57 #include "llvm/Support/BlockFrequency.h"
58 #include "llvm/Support/BranchProbability.h"
59 #include "llvm/Support/CommandLine.h"
60 #include "llvm/Support/Debug.h"
61 #include "llvm/Support/raw_ostream.h"
62 #include "llvm/Transforms/IPO.h"
63 #include "llvm/Transforms/Scalar.h"
64 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
65 #include "llvm/Transforms/Utils/Cloning.h"
66 #include "llvm/Transforms/Utils/CodeExtractor.h"
67 #include "llvm/Transforms/Utils/Local.h"
68 #include "llvm/Transforms/Utils/ValueMapper.h"
69 #include <algorithm>
70 #include <limits>
71 #include <cassert>
72 #include <string>
73 
#define DEBUG_TYPE "hotcoldsplit"

// Pass-wide statistics, printed with -stats.
STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");

using namespace llvm;

// When profile data is unavailable for a block, fall back to static heuristics
// (see unlikelyExecuted) to decide whether it is cold.
static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
                                          cl::init(true), cl::Hidden);

// Base penalty applied to every candidate split; setting it <= 0 disables the
// profitability check entirely (see getOutliningPenalty).
static cl::opt<int>
    SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
                       cl::desc("Base penalty for splitting cold code (as a "
                                "multiple of TCC_Basic)"));

static cl::opt<bool> EnableColdSection(
    "enable-cold-section", cl::init(false), cl::Hidden,
    cl::desc("Enable placement of extracted cold functions"
             " into a separate section after hot-cold splitting."));

static cl::opt<std::string>
    ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
                    cl::Hidden,
                    cl::desc("Name for the section containing cold functions "
                             "extracted by hot-cold splitting."));

// Regions requiring more parameters than this are rejected as unprofitable
// (see getOutliningPenalty).
static cl::opt<int> MaxParametersForSplit(
    "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
    cl::desc("Maximum number of parameters for a split function"));
103 
104 namespace {
105 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
106 // this function unless you modify the MBB version as well.
107 //
108 /// A no successor, non-return block probably ends in unreachable and is cold.
109 /// Also consider a block that ends in an indirect branch to be a return block,
110 /// since many targets use plain indirect branches to return.
111 bool blockEndsInUnreachable(const BasicBlock &BB) {
112   if (!succ_empty(&BB))
113     return false;
114   if (BB.empty())
115     return true;
116   const Instruction *I = BB.getTerminator();
117   return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
118 }
119 
120 bool unlikelyExecuted(BasicBlock &BB) {
121   // Exception handling blocks are unlikely executed.
122   if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
123     return true;
124 
125   // The block is cold if it calls/invokes a cold function. However, do not
126   // mark sanitizer traps as cold.
127   for (Instruction &I : BB)
128     if (auto *CB = dyn_cast<CallBase>(&I))
129       if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
130         return true;
131 
132   // The block is cold if it has an unreachable terminator, unless it's
133   // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
134   if (blockEndsInUnreachable(BB)) {
135     if (auto *CI =
136             dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
137       if (CI->hasFnAttr(Attribute::NoReturn))
138         return false;
139     return true;
140   }
141 
142   return false;
143 }
144 
145 /// Check whether it's safe to outline \p BB.
146 static bool mayExtractBlock(const BasicBlock &BB) {
147   // EH pads are unsafe to outline because doing so breaks EH type tables. It
148   // follows that invoke instructions cannot be extracted, because CodeExtractor
149   // requires unwind destinations to be within the extraction region.
150   //
151   // Resumes that are not reachable from a cleanup landing pad are considered to
152   // be unreachable. It’s not safe to split them out either.
153   auto Term = BB.getTerminator();
154   return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
155          !isa<ResumeInst>(Term);
156 }
157 
158 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
159 /// If \p UpdateEntryCount is true (set when this is a new split function and
160 /// module has profile data), set entry count to 0 to ensure treated as cold.
161 /// Return true if the function is changed.
162 static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
163   assert(!F.hasOptNone() && "Can't mark this cold");
164   bool Changed = false;
165   if (!F.hasFnAttribute(Attribute::Cold)) {
166     F.addFnAttr(Attribute::Cold);
167     Changed = true;
168   }
169   if (!F.hasFnAttribute(Attribute::MinSize)) {
170     F.addFnAttr(Attribute::MinSize);
171     Changed = true;
172   }
173   if (UpdateEntryCount) {
174     // Set the entry count to 0 to ensure it is placed in the unlikely text
175     // section when function sections are enabled.
176     F.setEntryCount(0);
177     Changed = true;
178   }
179 
180   return Changed;
181 }
182 
/// Legacy pass-manager wrapper; the actual work is done by HotColdSplitting
/// (constructed in runOnModule below).
class HotColdSplittingLegacyPass : public ModulePass {
public:
  static char ID;
  HotColdSplittingLegacyPass() : ModulePass(ID) {
    initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  /// Declare the analyses this pass consumes.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    // AssumptionCache is optional; splitting proceeds without it.
    AU.addUsedIfAvailable<AssumptionCacheTracker>();
  }

  bool runOnModule(Module &M) override;
};
199 
200 } // end anonymous namespace
201 
202 /// Check whether \p F is inherently cold.
203 bool HotColdSplitting::isFunctionCold(const Function &F) const {
204   if (F.hasFnAttribute(Attribute::Cold))
205     return true;
206 
207   if (F.getCallingConv() == CallingConv::Cold)
208     return true;
209 
210   if (PSI->isFunctionEntryCold(&F))
211     return true;
212 
213   return false;
214 }
215 
216 // Returns false if the function should not be considered for hot-cold split
217 // optimization.
218 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
219   if (F.hasFnAttribute(Attribute::AlwaysInline))
220     return false;
221 
222   if (F.hasFnAttribute(Attribute::NoInline))
223     return false;
224 
225   // A function marked `noreturn` may contain unreachable terminators: these
226   // should not be considered cold, as the function may be a trampoline.
227   if (F.hasFnAttribute(Attribute::NoReturn))
228     return false;
229 
230   if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
231       F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
232       F.hasFnAttribute(Attribute::SanitizeThread) ||
233       F.hasFnAttribute(Attribute::SanitizeMemory))
234     return false;
235 
236   return true;
237 }
238 
239 /// Get the benefit score of outlining \p Region.
240 static InstructionCost getOutliningBenefit(ArrayRef<BasicBlock *> Region,
241                                            TargetTransformInfo &TTI) {
242   // Sum up the code size costs of non-terminator instructions. Tight coupling
243   // with \ref getOutliningPenalty is needed to model the costs of terminators.
244   InstructionCost Benefit = 0;
245   for (BasicBlock *BB : Region)
246     for (Instruction &I : BB->instructionsWithoutDebug())
247       if (&I != BB->getTerminator())
248         Benefit +=
249             TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
250 
251   return Benefit;
252 }
253 
254 /// Get the penalty score for outlining \p Region.
255 static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
256                                unsigned NumInputs, unsigned NumOutputs) {
257   int Penalty = SplittingThreshold;
258   LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
259 
260   // If the splitting threshold is set at or below zero, skip the usual
261   // profitability check.
262   if (SplittingThreshold <= 0)
263     return Penalty;
264 
265   // Find the number of distinct exit blocks for the region. Use a conservative
266   // check to determine whether control returns from the region.
267   bool NoBlocksReturn = true;
268   SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
269   for (BasicBlock *BB : Region) {
270     // If a block has no successors, only assume it does not return if it's
271     // unreachable.
272     if (succ_empty(BB)) {
273       NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
274       continue;
275     }
276 
277     for (BasicBlock *SuccBB : successors(BB)) {
278       if (!is_contained(Region, SuccBB)) {
279         NoBlocksReturn = false;
280         SuccsOutsideRegion.insert(SuccBB);
281       }
282     }
283   }
284 
285   // Count the number of phis in exit blocks with >= 2 incoming values from the
286   // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
287   // and new outputs are created to supply the split phis. CodeExtractor can't
288   // report these new outputs until extraction begins, but it's important to
289   // factor the cost of the outputs into the cost calculation.
290   unsigned NumSplitExitPhis = 0;
291   for (BasicBlock *ExitBB : SuccsOutsideRegion) {
292     for (PHINode &PN : ExitBB->phis()) {
293       // Find all incoming values from the outlining region.
294       int NumIncomingVals = 0;
295       for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
296         if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
297           ++NumIncomingVals;
298           if (NumIncomingVals > 1) {
299             ++NumSplitExitPhis;
300             break;
301           }
302         }
303     }
304   }
305 
306   // Apply a penalty for calling the split function. Factor in the cost of
307   // materializing all of the parameters.
308   int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
309   int NumParams = NumInputs + NumOutputsAndSplitPhis;
310   if (NumParams > MaxParametersForSplit) {
311     LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
312                       << " outputs exceeds parameter limit ("
313                       << MaxParametersForSplit << ")\n");
314     return std::numeric_limits<int>::max();
315   }
316   const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
317   LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
318   Penalty += CostForArgMaterialization * NumParams;
319 
320   // Apply the typical code size cost for an output alloca and its associated
321   // reload in the caller. Also penalize the associated store in the callee.
322   LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
323                     << " outputs/split phis\n");
324   const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
325   Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
326 
327   // Apply a `noreturn` bonus.
328   if (NoBlocksReturn) {
329     LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
330                       << " non-returning terminators\n");
331     Penalty -= Region.size();
332   }
333 
334   // Apply a penalty for having more than one successor outside of the region.
335   // This penalty accounts for the switch needed in the caller.
336   if (SuccsOutsideRegion.size() > 1) {
337     LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
338                       << " non-region successors\n");
339     Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
340   }
341 
342   return Penalty;
343 }
344 
/// Try to outline \p Region into a new function.
///
/// \param Region  Non-empty block sequence; Region[0] is the entry block.
/// \param CEAC    Cached CodeExtractor analysis, shared across regions of the
///                same function to avoid recomputation.
/// \param BFI     Non-null iff the module has a profile summary; forwarded to
///                markFunctionCold to decide whether to zero the entry count.
/// \param Count   Numeric suffix used to uniquify the split function's name.
/// \returns the new function, or nullptr if splitting was unprofitable or
/// extraction failed.
Function *HotColdSplitting::extractColdRegion(
    const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
    DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
    OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
  assert(!Region.empty());

  // TODO: Pass BFI and BPI to update profile information.
  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
                   /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
                   /* AllowAlloca */ false,
                   /* Suffix */ "cold." + std::to_string(Count));

  // Perform a simple cost/benefit analysis to decide whether or not to permit
  // splitting.
  SetVector<Value *> Inputs, Outputs, Sinks;
  CE.findInputsOutputs(Inputs, Outputs, Sinks);
  InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
  int OutliningPenalty =
      getOutliningPenalty(Region, Inputs.size(), Outputs.size());
  LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
                    << ", penalty = " << OutliningPenalty << "\n");
  if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
    return nullptr;

  Function *OrigF = Region[0]->getParent();
  if (Function *OutF = CE.extractCodeRegion(CEAC)) {
    // NOTE(review): this assumes the freshly extracted function has a user,
    // presumably the single call CodeExtractor just emitted -- confirm against
    // CodeExtractor::extractCodeRegion.
    User *U = *OutF->user_begin();
    CallInst *CI = cast<CallInst>(U);
    NumColdRegionsOutlined++;
    if (TTI.useColdCCForColdCall(*OutF)) {
      OutF->setCallingConv(CallingConv::Cold);
      CI->setCallingConv(CallingConv::Cold);
    }
    // Keep the cold code from being inlined straight back into the caller.
    CI->setIsNoInline();

    if (EnableColdSection)
      OutF->setSection(ColdSectionName);
    else {
      // Preserve an explicit section from the original function, if any.
      if (OrigF->hasSection())
        OutF->setSection(OrigF->getSection());
    }

    markFunctionCold(*OutF, BFI != nullptr);

    LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
                                &*Region[0]->begin())
             << ore::NV("Original", OrigF) << " split cold code into "
             << ore::NV("Split", OutF);
    });
    return OutF;
  }

  // Extraction failed; report it as a missed optimization.
  ORE.emit([&]() {
    return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                    &*Region[0]->begin())
           << "Failed to extract region at block "
           << ore::NV("Block", Region.front());
  });
  return nullptr;
}
407 
/// A pair of (basic block, score).
using BlockTy = std::pair<BasicBlock *, unsigned>;

namespace {
/// A maximal outlining region. This contains all blocks post-dominated by a
/// sink block, the sink block itself, and all blocks dominated by the sink.
/// If sink-predecessors and sink-successors cannot be extracted in one region,
/// the static constructor returns a list of suitable extraction regions.
class OutliningRegion {
  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
  /// viable sub-region entry point. Blocks with higher scores are better entry
  /// points (i.e. they are more distant ancestors of the sink block).
  SmallVector<BlockTy, 0> Blocks = {};

  /// The suggested entry point into the region. If the region has multiple
  /// entry points, all blocks within the region may not be reachable from this
  /// entry point.
  BasicBlock *SuggestedEntryPoint = nullptr;

  /// Whether the entire function is cold.
  bool EntireFunctionCold = false;

  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
    return mayExtractBlock(BB) ? Score : 0;
  }

  /// These scores should be lower than the score for predecessor blocks,
  /// because regions starting at predecessor blocks are typically larger.
  static constexpr unsigned ScoreForSuccBlock = 1;
  static constexpr unsigned ScoreForSinkBlock = 1;

  // Regions are move-only: they own a potentially large block list.
  OutliningRegion(const OutliningRegion &) = delete;
  OutliningRegion &operator=(const OutliningRegion &) = delete;

public:
  OutliningRegion() = default;
  OutliningRegion(OutliningRegion &&) = default;
  OutliningRegion &operator=(OutliningRegion &&) = default;

  /// Grow the maximal cold region(s) around \p SinkBB (see the class comment).
  static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
                                             const DominatorTree &DT,
                                             const PostDominatorTree &PDT) {
    std::vector<OutliningRegion> Regions;
    // Blocks already claimed by some region; used to keep the backwards and
    // forwards traversals below from claiming the same block twice.
    SmallPtrSet<BasicBlock *, 4> RegionBlocks;

    Regions.emplace_back();
    OutliningRegion *ColdRegion = &Regions.back();

    auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
      RegionBlocks.insert(BB);
      ColdRegion->Blocks.emplace_back(BB, Score);
    };

    // The ancestor farthest-away from SinkBB, and also post-dominated by it.
    unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
    ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
    unsigned BestScore = SinkScore;

    // Visit SinkBB's ancestors using inverse DFS. The leading ++ skips SinkBB
    // itself, which is the first node of the traversal.
    auto PredIt = ++idf_begin(&SinkBB);
    auto PredEnd = idf_end(&SinkBB);
    while (PredIt != PredEnd) {
      BasicBlock &PredBB = **PredIt;
      bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);

      // If the predecessor is cold and has no predecessors, the entire
      // function must be cold.
      if (SinkPostDom && pred_empty(&PredBB)) {
        ColdRegion->EntireFunctionCold = true;
        return Regions;
      }

      // If SinkBB does not post-dominate a predecessor, do not mark the
      // predecessor (or any of its predecessors) cold.
      if (!SinkPostDom || !mayExtractBlock(PredBB)) {
        PredIt.skipChildren();
        continue;
      }

      // Keep track of the post-dominated ancestor farthest away from the sink.
      // The path length is always >= 2, ensuring that predecessor blocks are
      // considered as entry points before the sink block.
      unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
      if (PredScore > BestScore) {
        ColdRegion->SuggestedEntryPoint = &PredBB;
        BestScore = PredScore;
      }

      addBlockToRegion(&PredBB, PredScore);
      ++PredIt;
    }

    // If the sink can be added to the cold region, do so. It's considered as
    // an entry point before any sink-successor blocks.
    //
    // Otherwise, split cold sink-successor blocks using a separate region.
    // This satisfies the requirement that all extraction blocks other than the
    // first have predecessors within the extraction region.
    if (mayExtractBlock(SinkBB)) {
      addBlockToRegion(&SinkBB, SinkScore);
      if (pred_empty(&SinkBB)) {
        ColdRegion->EntireFunctionCold = true;
        return Regions;
      }
    } else {
      Regions.emplace_back();
      ColdRegion = &Regions.back();
      BestScore = 0;
    }

    // Find all successors of SinkBB dominated by SinkBB using DFS. As above,
    // the leading ++ skips SinkBB itself.
    auto SuccIt = ++df_begin(&SinkBB);
    auto SuccEnd = df_end(&SinkBB);
    while (SuccIt != SuccEnd) {
      BasicBlock &SuccBB = **SuccIt;
      bool SinkDom = DT.dominates(&SinkBB, &SuccBB);

      // Don't allow the backwards & forwards DFSes to mark the same block.
      bool DuplicateBlock = RegionBlocks.count(&SuccBB);

      // If SinkBB does not dominate a successor, do not mark the successor (or
      // any of its successors) cold.
      if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
        SuccIt.skipChildren();
        continue;
      }

      unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
      if (SuccScore > BestScore) {
        ColdRegion->SuggestedEntryPoint = &SuccBB;
        BestScore = SuccScore;
      }

      addBlockToRegion(&SuccBB, SuccScore);
      ++SuccIt;
    }

    return Regions;
  }

  /// Whether this region has nothing to extract.
  bool empty() const { return !SuggestedEntryPoint; }

  /// The blocks in this region.
  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }

  /// Whether the entire function containing this region is cold.
  bool isEntireFunctionCold() const { return EntireFunctionCold; }

  /// Remove a sub-region from this region and return it as a block sequence.
  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
    assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");

    // Remove blocks dominated by the suggested entry point from this region.
    // During the removal, identify the next best entry point into the region.
    // Ensure that the first extracted block is the suggested entry point.
    BlockSequence SubRegion = {SuggestedEntryPoint};
    BasicBlock *NextEntryPoint = nullptr;
    unsigned NextScore = 0;
    auto RegionEndIt = Blocks.end();
    // remove_if shifts the kept (not-in-subregion) entries to the front and
    // returns an iterator to the new logical end of the kept range.
    auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
      BasicBlock *BB = Block.first;
      unsigned Score = Block.second;
      bool InSubRegion =
          BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
      if (!InSubRegion && Score > NextScore) {
        NextEntryPoint = BB;
        NextScore = Score;
      }
      if (InSubRegion && BB != SuggestedEntryPoint)
        SubRegion.push_back(BB);
      return InSubRegion;
    });
    Blocks.erase(RegionStartIt, RegionEndIt);

    // Update the suggested entry point. May become null, making this region
    // empty() once no viable entry point remains.
    SuggestedEntryPoint = NextEntryPoint;

    return SubRegion;
  }
};
} // namespace
591 
/// Identify cold regions in \p F and outline the profitable ones into new
/// functions. \p HasProfileSummary is true when the module carries profile
/// data, enabling BFI-based coldness queries in addition to the static
/// heuristics. Returns true if \p F was changed.
bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
  bool Changed = false;

  // The set of cold blocks.
  SmallPtrSet<BasicBlock *, 4> ColdBlocks;

  // The worklist of non-intersecting regions left to outline.
  SmallVector<OutliningRegion, 2> OutliningWorklist;

  // Set up an RPO traversal. Experimentally, this performs better (outlines
  // more) than a PO traversal, because we prevent region overlap by keeping
  // the first region to contain a block.
  ReversePostOrderTraversal<Function *> RPOT(&F);

  // Calculate domtrees lazily. This reduces compile-time significantly.
  std::unique_ptr<DominatorTree> DT;
  std::unique_ptr<PostDominatorTree> PDT;

  // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
  // reduces compile-time significantly. TODO: When we *do* use BFI, we should
  // be able to salvage its domtrees instead of recomputing them.
  BlockFrequencyInfo *BFI = nullptr;
  if (HasProfileSummary)
    BFI = GetBFI(F);

  TargetTransformInfo &TTI = GetTTI(F);
  OptimizationRemarkEmitter &ORE = (*GetORE)(F);
  AssumptionCache *AC = LookupAC(F);

  // Find all cold regions.
  for (BasicBlock *BB : RPOT) {
    // This block is already part of some outlining region.
    if (ColdBlocks.count(BB))
      continue;

    // A block is cold per the profile, or (when static analysis is enabled)
    // per the heuristics in unlikelyExecuted.
    bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
                (EnableStaticAnalysis && unlikelyExecuted(*BB));
    if (!Cold)
      continue;

    LLVM_DEBUG({
      dbgs() << "Found a cold block:\n";
      BB->dump();
    });

    // Build the domtrees on first use only.
    if (!DT)
      DT = std::make_unique<DominatorTree>(F);
    if (!PDT)
      PDT = std::make_unique<PostDominatorTree>(F);

    auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
    for (OutliningRegion &Region : Regions) {
      if (Region.empty())
        continue;

      if (Region.isEntireFunctionCold()) {
        LLVM_DEBUG(dbgs() << "Entire function is cold\n");
        return markFunctionCold(F);
      }

      // If this outlining region intersects with another, drop the new region.
      //
      // TODO: It's theoretically possible to outline more by only keeping the
      // largest region which contains a block, but the extra bookkeeping to do
      // this is tricky/expensive.
      bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
        return !ColdBlocks.insert(Block.first).second;
      });
      if (RegionsOverlap)
        continue;

      OutliningWorklist.emplace_back(std::move(Region));
      ++NumColdRegionsFound;
    }
  }

  if (OutliningWorklist.empty())
    return Changed;

  // Outline single-entry cold regions, splitting up larger regions as needed.
  unsigned OutlinedFunctionID = 1;
  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
  CodeExtractorAnalysisCache CEAC(F);
  do {
    OutliningRegion Region = OutliningWorklist.pop_back_val();
    assert(!Region.empty() && "Empty outlining region in worklist");
    do {
      // DT is non-null here: a non-empty worklist implies at least one region
      // was created above, which constructs DT first.
      BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
      LLVM_DEBUG({
        dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
        for (BasicBlock *BB : SubRegion)
          BB->dump();
      });

      Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
                                             ORE, AC, OutlinedFunctionID);
      if (Outlined) {
        ++OutlinedFunctionID;
        Changed = true;
      }
    } while (!Region.empty());
  } while (!OutliningWorklist.empty());

  return Changed;
}
697 
698 bool HotColdSplitting::run(Module &M) {
699   bool Changed = false;
700   bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
701   for (auto It = M.begin(), End = M.end(); It != End; ++It) {
702     Function &F = *It;
703 
704     // Do not touch declarations.
705     if (F.isDeclaration())
706       continue;
707 
708     // Do not modify `optnone` functions.
709     if (F.hasOptNone())
710       continue;
711 
712     // Detect inherently cold functions and mark them as such.
713     if (isFunctionCold(F)) {
714       Changed |= markFunctionCold(F);
715       continue;
716     }
717 
718     if (!shouldOutlineFrom(F)) {
719       LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
720       continue;
721     }
722 
723     LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
724     Changed |= outlineColdRegions(F, HasProfileSummary);
725   }
726   return Changed;
727 }
728 
729 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
730   if (skipModule(M))
731     return false;
732   ProfileSummaryInfo *PSI =
733       &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
734   auto GTTI = [this](Function &F) -> TargetTransformInfo & {
735     return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
736   };
737   auto GBFI = [this](Function &F) {
738     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
739   };
740   std::unique_ptr<OptimizationRemarkEmitter> ORE;
741   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
742       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
743     ORE.reset(new OptimizationRemarkEmitter(&F));
744     return *ORE.get();
745   };
746   auto LookupAC = [this](Function &F) -> AssumptionCache * {
747     if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
748       return ACT->lookupAssumptionCache(F);
749     return nullptr;
750   };
751 
752   return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
753 }
754 
755 PreservedAnalyses
756 HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
757   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
758 
759   auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
760     return FAM.getCachedResult<AssumptionAnalysis>(F);
761   };
762 
763   auto GBFI = [&FAM](Function &F) {
764     return &FAM.getResult<BlockFrequencyAnalysis>(F);
765   };
766 
767   std::function<TargetTransformInfo &(Function &)> GTTI =
768       [&FAM](Function &F) -> TargetTransformInfo & {
769     return FAM.getResult<TargetIRAnalysis>(F);
770   };
771 
772   std::unique_ptr<OptimizationRemarkEmitter> ORE;
773   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
774       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
775     ORE.reset(new OptimizationRemarkEmitter(&F));
776     return *ORE.get();
777   };
778 
779   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
780 
781   if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
782     return PreservedAnalyses::none();
783   return PreservedAnalyses::all();
784 }
785 
char HotColdSplittingLegacyPass::ID = 0;
// Register the legacy pass and its required analyses with the pass registry.
INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
                      "Hot Cold Splitting", false, false)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
                    "Hot Cold Splitting", false, false)

/// Factory for the legacy pass manager.
ModulePass *llvm::createHotColdSplittingPass() {
  return new HotColdSplittingLegacyPass();
}
797