xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp (revision 6966ac055c3b7a39266fb982493330df7a097997)
1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/CallSite.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/DiagnosticInfo.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63 
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "partial-inlining"
67 
68 STATISTIC(NumPartialInlined,
69           "Number of callsites functions partially inlined into.");
70 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
71                                         "cold outlined regions were partially "
72                                         "inlined into its caller(s).");
73 STATISTIC(NumColdRegionsFound,
74            "Number of cold single entry/exit regions found.");
75 STATISTIC(NumColdRegionsOutlined,
76            "Number of cold single entry/exit regions outlined.");
77 
78 // Command line option to disable partial-inlining. The default is false:
79 static cl::opt<bool>
80     DisablePartialInlining("disable-partial-inlining", cl::init(false),
81                            cl::Hidden, cl::desc("Disable partial inlining"));
82 // Command line option to disable multi-region partial-inlining. The default is
83 // false:
84 static cl::opt<bool> DisableMultiRegionPartialInline(
85     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
86     cl::desc("Disable multi-region partial inlining"));
87 
88 // Command line option to force outlining in regions with live exit variables.
89 // The default is false:
90 static cl::opt<bool>
91     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
92                cl::desc("Force outline regions with live exits"));
93 
94 // Command line option to enable marking outline functions with Cold Calling
95 // Convention. The default is false:
96 static cl::opt<bool>
97     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
98                        cl::desc("Mark outline function calls with ColdCC"));
99 
100 #ifndef NDEBUG
101 // Command line option to debug partial-inlining. The default is none:
102 static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
103                                           cl::init(false), cl::Hidden,
104                                           cl::desc("Trace partial inlining."));
105 #endif
106 
107 // This is an option used by testing:
108 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
109                                       cl::init(false), cl::ZeroOrMore,
110                                       cl::ReallyHidden,
111                                       cl::desc("Skip Cost Analysis"));
112 // Used to determine if a cold region is worth outlining based on
113 // its inlining cost compared to the original function.  Default is set at 10%.
114 // ie. if the cold region reduces the inlining cost of the original function by
115 // at least 10%.
116 static cl::opt<float> MinRegionSizeRatio(
117     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
118     cl::desc("Minimum ratio comparing relative sizes of each "
119              "outline candidate and original function"));
120 // Used to tune the minimum number of execution counts needed in the predecessor
121 // block to the cold edge. ie. confidence interval.
122 static cl::opt<unsigned>
123     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
124                              cl::desc("Minimum block executions to consider "
125                                       "its BranchProbabilityInfo valid"));
126 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
127 // if the branch probability is 10% or less, then it is deemed as 'cold'.
128 static cl::opt<float> ColdBranchRatio(
129     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
130     cl::desc("Minimum BranchProbability to consider a region cold."));
131 
132 static cl::opt<unsigned> MaxNumInlineBlocks(
133     "max-num-inline-blocks", cl::init(5), cl::Hidden,
134     cl::desc("Max number of blocks to be partially inlined"));
135 
136 // Command line option to set the maximum number of partial inlining allowed
137 // for the module. The default value of -1 means no limit.
138 static cl::opt<int> MaxNumPartialInlining(
139     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
140     cl::desc("Max number of partial inlining. The default is unlimited"));
141 
142 // Used only when PGO or user annotated branch data is absent. It is
143 // the least value that is used to weigh the outline region. If BFI
144 // produces larger value, the BFI value will be used.
145 static cl::opt<int>
146     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
147                              cl::Hidden, cl::ZeroOrMore,
148                              cl::desc("Relative frequency of outline region to "
149                                       "the entry block"));
150 
151 static cl::opt<unsigned> ExtraOutliningPenalty(
152     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
153     cl::desc("A debug option to add additional penalty to the computed one."));
154 
155 namespace {
156 
157 struct FunctionOutliningInfo {
158   FunctionOutliningInfo() = default;
159 
160   // Returns the number of blocks to be inlined including all blocks
161   // in Entries and one return block.
162   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
163 
164   // A set of blocks including the function entry that guard
165   // the region to be outlined.
166   SmallVector<BasicBlock *, 4> Entries;
167 
168   // The return block that is not included in the outlined region.
169   BasicBlock *ReturnBlock = nullptr;
170 
171   // The dominating block of the region to be outlined.
172   BasicBlock *NonReturnBlock = nullptr;
173 
174   // The set of blocks in Entries that that are predecessors to ReturnBlock
175   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
176 };
177 
178 struct FunctionOutliningMultiRegionInfo {
179   FunctionOutliningMultiRegionInfo()
180       : ORI() {}
181 
182   // Container for outline regions
183   struct OutlineRegionInfo {
184     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
185                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
186                       BasicBlock *ReturnBlock)
187         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
188           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
189     SmallVector<BasicBlock *, 8> Region;
190     BasicBlock *EntryBlock;
191     BasicBlock *ExitBlock;
192     BasicBlock *ReturnBlock;
193   };
194 
195   SmallVector<OutlineRegionInfo, 4> ORI;
196 };
197 
198 struct PartialInlinerImpl {
199 
200   PartialInlinerImpl(
201       std::function<AssumptionCache &(Function &)> *GetAC,
202       function_ref<AssumptionCache *(Function &)> LookupAC,
203       std::function<TargetTransformInfo &(Function &)> *GTTI,
204       Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
205       ProfileSummaryInfo *ProfSI)
206       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
207         GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
208 
209   bool run(Module &M);
210   // Main part of the transformation that calls helper functions to find
211   // outlining candidates, clone & outline the function, and attempt to
212   // partially inline the resulting function. Returns true if
213   // inlining was successful, false otherwise.  Also returns the outline
214   // function (only if we partially inlined early returns) as there is a
215   // possibility to further "peel" early return statements that were left in the
216   // outline function due to code size.
217   std::pair<bool, Function *> unswitchFunction(Function *F);
218 
219   // This class speculatively clones the function to be partial inlined.
220   // At the end of partial inlining, the remaining callsites to the cloned
221   // function that are not partially inlined will be fixed up to reference
222   // the original function, and the cloned function will be erased.
223   struct FunctionCloner {
224     // Two constructors, one for single region outlining, the other for
225     // multi-region outlining.
226     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
227                    OptimizationRemarkEmitter &ORE,
228                    function_ref<AssumptionCache *(Function &)> LookupAC);
229     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
230                    OptimizationRemarkEmitter &ORE,
231                    function_ref<AssumptionCache *(Function &)> LookupAC);
232     ~FunctionCloner();
233 
234     // Prepare for function outlining: making sure there is only
235     // one incoming edge from the extracted/outlined region to
236     // the return block.
237     void NormalizeReturnBlock();
238 
239     // Do function outlining for cold regions.
240     bool doMultiRegionFunctionOutlining();
241     // Do function outlining for region after early return block(s).
242     // NOTE: For vararg functions that do the vararg handling in the outlined
243     //       function, we temporarily generate IR that does not properly
244     //       forward varargs to the outlined function. Calling InlineFunction
245     //       will update calls to the outlined functions to properly forward
246     //       the varargs.
247     Function *doSingleRegionFunctionOutlining();
248 
249     Function *OrigFunc = nullptr;
250     Function *ClonedFunc = nullptr;
251 
252     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
253     // Keep track of Outlined Functions and the basic block they're called from.
254     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
255 
256     // ClonedFunc is inlined in one of its callers after function
257     // outlining.
258     bool IsFunctionInlined = false;
259     // The cost of the region to be outlined.
260     int OutlinedRegionCost = 0;
261     // ClonedOI is specific to outlining non-early return blocks.
262     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
263     // ClonedOMRI is specific to outlining cold regions.
264     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
265     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
266     OptimizationRemarkEmitter &ORE;
267     function_ref<AssumptionCache *(Function &)> LookupAC;
268   };
269 
270 private:
271   int NumPartialInlining = 0;
272   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
273   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
274   std::function<TargetTransformInfo &(Function &)> *GetTTI;
275   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
276   ProfileSummaryInfo *PSI;
277 
278   // Return the frequency of the OutlininingBB relative to F's entry point.
279   // The result is no larger than 1 and is represented using BP.
280   // (Note that the outlined region's 'head' block can only have incoming
281   // edges from the guarding entry blocks).
282   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
283 
284   // Return true if the callee of CS should be partially inlined with
285   // profit.
286   bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
287                            BlockFrequency WeightedOutliningRcost,
288                            OptimizationRemarkEmitter &ORE);
289 
290   // Try to inline DuplicateFunction (cloned from F with call to
291   // the OutlinedFunction into its callers. Return true
292   // if there is any successful inlining.
293   bool tryPartialInline(FunctionCloner &Cloner);
294 
295   // Compute the mapping from use site of DuplicationFunction to the enclosing
296   // BB's profile count.
297   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
298                                      DenseMap<User *, uint64_t> &SiteCountMap);
299 
300   bool IsLimitReached() {
301     return (MaxNumPartialInlining != -1 &&
302             NumPartialInlining >= MaxNumPartialInlining);
303   }
304 
305   static CallSite getCallSite(User *U) {
306     CallSite CS;
307     if (CallInst *CI = dyn_cast<CallInst>(U))
308       CS = CallSite(CI);
309     else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
310       CS = CallSite(II);
311     else
312       llvm_unreachable("All uses must be calls");
313     return CS;
314   }
315 
316   static CallSite getOneCallSiteTo(Function *F) {
317     User *User = *F->user_begin();
318     return getCallSite(User);
319   }
320 
321   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
322     CallSite CS = getOneCallSiteTo(F);
323     DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
324     BasicBlock *Block = CS.getParent();
325     return std::make_tuple(DLoc, Block);
326   }
327 
328   // Returns the costs associated with function outlining:
329   // - The first value is the non-weighted runtime cost for making the call
330   //   to the outlined function, including the addtional  setup cost in the
331   //    outlined function itself;
332   // - The second value is the estimated size of the new call sequence in
333   //   basic block Cloner.OutliningCallBB;
334   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
335 
336   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
337   // approximate both the size and runtime cost (Note that in the current
338   // inline cost analysis, there is no clear distinction there either).
339   static int computeBBInlineCost(BasicBlock *BB);
340 
341   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
342   std::unique_ptr<FunctionOutliningMultiRegionInfo>
343   computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
344 };
345 
346 struct PartialInlinerLegacyPass : public ModulePass {
347   static char ID; // Pass identification, replacement for typeid
348 
349   PartialInlinerLegacyPass() : ModulePass(ID) {
350     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
351   }
352 
353   void getAnalysisUsage(AnalysisUsage &AU) const override {
354     AU.addRequired<AssumptionCacheTracker>();
355     AU.addRequired<ProfileSummaryInfoWrapperPass>();
356     AU.addRequired<TargetTransformInfoWrapperPass>();
357   }
358 
359   bool runOnModule(Module &M) override {
360     if (skipModule(M))
361       return false;
362 
363     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
364     TargetTransformInfoWrapperPass *TTIWP =
365         &getAnalysis<TargetTransformInfoWrapperPass>();
366     ProfileSummaryInfo *PSI =
367         &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
368 
369     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
370         [&ACT](Function &F) -> AssumptionCache & {
371       return ACT->getAssumptionCache(F);
372     };
373 
374     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
375       return ACT->lookupAssumptionCache(F);
376     };
377 
378     std::function<TargetTransformInfo &(Function &)> GetTTI =
379         [&TTIWP](Function &F) -> TargetTransformInfo & {
380       return TTIWP->getTTI(F);
381     };
382 
383     return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
384                               &GetTTI, NoneType::None, PSI)
385         .run(M);
386   }
387 };
388 
389 } // end anonymous namespace
390 
391 std::unique_ptr<FunctionOutliningMultiRegionInfo>
392 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
393                                                     OptimizationRemarkEmitter &ORE) {
394   BasicBlock *EntryBlock = &F->front();
395 
396   DominatorTree DT(*F);
397   LoopInfo LI(DT);
398   BranchProbabilityInfo BPI(*F, LI);
399   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
400   BlockFrequencyInfo *BFI;
401   if (!GetBFI) {
402     ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
403     BFI = ScopedBFI.get();
404   } else
405     BFI = &(*GetBFI)(*F);
406 
407   // Return if we don't have profiling information.
408   if (!PSI->hasInstrumentationProfile())
409     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
410 
411   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
412       llvm::make_unique<FunctionOutliningMultiRegionInfo>();
413 
414   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
415     BasicBlock *Dom = BlockList.front();
416     return BlockList.size() > 1 && Dom->hasNPredecessors(1);
417   };
418 
419   auto IsSingleExit =
420       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
421     BasicBlock *ExitBlock = nullptr;
422     for (auto *Block : BlockList) {
423       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
424         if (!is_contained(BlockList, *SI)) {
425           if (ExitBlock) {
426             ORE.emit([&]() {
427               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
428                                               &SI->front())
429                      << "Region dominated by "
430                      << ore::NV("Block", BlockList.front()->getName())
431                      << " has more than one region exit edge.";
432             });
433             return nullptr;
434           } else
435             ExitBlock = Block;
436         }
437       }
438     }
439     return ExitBlock;
440   };
441 
442   auto BBProfileCount = [BFI](BasicBlock *BB) {
443     return BFI->getBlockProfileCount(BB)
444                ? BFI->getBlockProfileCount(BB).getValue()
445                : 0;
446   };
447 
448   // Use the same computeBBInlineCost function to compute the cost savings of
449   // the outlining the candidate region.
450   int OverallFunctionCost = 0;
451   for (auto &BB : *F)
452     OverallFunctionCost += computeBBInlineCost(&BB);
453 
454 #ifndef NDEBUG
455   if (TracePartialInlining)
456     dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
457 #endif
458   int MinOutlineRegionCost =
459       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
460   BranchProbability MinBranchProbability(
461       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
462       MinBlockCounterExecution);
463   bool ColdCandidateFound = false;
464   BasicBlock *CurrEntry = EntryBlock;
465   std::vector<BasicBlock *> DFS;
466   DenseMap<BasicBlock *, bool> VisitedMap;
467   DFS.push_back(CurrEntry);
468   VisitedMap[CurrEntry] = true;
469   // Use Depth First Search on the basic blocks to find CFG edges that are
470   // considered cold.
471   // Cold regions considered must also have its inline cost compared to the
472   // overall inline cost of the original function.  The region is outlined only
473   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
474   // more.
475   while (!DFS.empty()) {
476     auto *thisBB = DFS.back();
477     DFS.pop_back();
478     // Only consider regions with predecessor blocks that are considered
479     // not-cold (default: part of the top 99.99% of all block counters)
480     // AND greater than our minimum block execution count (default: 100).
481     if (PSI->isColdBlock(thisBB, BFI) ||
482         BBProfileCount(thisBB) < MinBlockCounterExecution)
483       continue;
484     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
485       if (VisitedMap[*SI])
486         continue;
487       VisitedMap[*SI] = true;
488       DFS.push_back(*SI);
489       // If branch isn't cold, we skip to the next one.
490       BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
491       if (SuccProb > MinBranchProbability)
492         continue;
493 #ifndef NDEBUG
494       if (TracePartialInlining) {
495         dbgs() << "Found cold edge: " << thisBB->getName() << "->"
496                << (*SI)->getName() << "\nBranch Probability = " << SuccProb
497                << "\n";
498       }
499 #endif
500       SmallVector<BasicBlock *, 8> DominateVector;
501       DT.getDescendants(*SI, DominateVector);
502       // We can only outline single entry regions (for now).
503       if (!IsSingleEntry(DominateVector))
504         continue;
505       BasicBlock *ExitBlock = nullptr;
506       // We can only outline single exit regions (for now).
507       if (!(ExitBlock = IsSingleExit(DominateVector)))
508         continue;
509       int OutlineRegionCost = 0;
510       for (auto *BB : DominateVector)
511         OutlineRegionCost += computeBBInlineCost(BB);
512 
513 #ifndef NDEBUG
514       if (TracePartialInlining)
515         dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
516 #endif
517 
518       if (OutlineRegionCost < MinOutlineRegionCost) {
519         ORE.emit([&]() {
520           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
521                                             &SI->front())
522                  << ore::NV("Callee", F) << " inline cost-savings smaller than "
523                  << ore::NV("Cost", MinOutlineRegionCost);
524         });
525         continue;
526       }
527       // For now, ignore blocks that belong to a SISE region that is a
528       // candidate for outlining.  In the future, we may want to look
529       // at inner regions because the outer region may have live-exit
530       // variables.
531       for (auto *BB : DominateVector)
532         VisitedMap[BB] = true;
533       // ReturnBlock here means the block after the outline call
534       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
535       // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
536       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
537           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
538       OutliningInfo->ORI.push_back(RegInfo);
539 #ifndef NDEBUG
540       if (TracePartialInlining) {
541         dbgs() << "Found Cold Candidate starting at block: "
542                << DominateVector.front()->getName() << "\n";
543       }
544 #endif
545       ColdCandidateFound = true;
546       NumColdRegionsFound++;
547     }
548   }
549   if (ColdCandidateFound)
550     return OutliningInfo;
551   else
552     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
553 }
554 
555 std::unique_ptr<FunctionOutliningInfo>
556 PartialInlinerImpl::computeOutliningInfo(Function *F) {
557   BasicBlock *EntryBlock = &F->front();
558   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
559   if (!BR || BR->isUnconditional())
560     return std::unique_ptr<FunctionOutliningInfo>();
561 
562   // Returns true if Succ is BB's successor
563   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
564     return is_contained(successors(BB), Succ);
565   };
566 
567   auto IsReturnBlock = [](BasicBlock *BB) {
568     Instruction *TI = BB->getTerminator();
569     return isa<ReturnInst>(TI);
570   };
571 
572   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
573     if (IsReturnBlock(Succ1))
574       return std::make_tuple(Succ1, Succ2);
575     if (IsReturnBlock(Succ2))
576       return std::make_tuple(Succ2, Succ1);
577 
578     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
579   };
580 
581   // Detect a triangular shape:
582   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
583     if (IsSuccessor(Succ1, Succ2))
584       return std::make_tuple(Succ1, Succ2);
585     if (IsSuccessor(Succ2, Succ1))
586       return std::make_tuple(Succ2, Succ1);
587 
588     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
589   };
590 
591   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
592       llvm::make_unique<FunctionOutliningInfo>();
593 
594   BasicBlock *CurrEntry = EntryBlock;
595   bool CandidateFound = false;
596   do {
597     // The number of blocks to be inlined has already reached
598     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
599     // disables partial inlining for the function.
600     if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
601       break;
602 
603     if (succ_size(CurrEntry) != 2)
604       break;
605 
606     BasicBlock *Succ1 = *succ_begin(CurrEntry);
607     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
608 
609     BasicBlock *ReturnBlock, *NonReturnBlock;
610     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
611 
612     if (ReturnBlock) {
613       OutliningInfo->Entries.push_back(CurrEntry);
614       OutliningInfo->ReturnBlock = ReturnBlock;
615       OutliningInfo->NonReturnBlock = NonReturnBlock;
616       CandidateFound = true;
617       break;
618     }
619 
620     BasicBlock *CommSucc;
621     BasicBlock *OtherSucc;
622     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
623 
624     if (!CommSucc)
625       break;
626 
627     OutliningInfo->Entries.push_back(CurrEntry);
628     CurrEntry = OtherSucc;
629   } while (true);
630 
631   if (!CandidateFound)
632     return std::unique_ptr<FunctionOutliningInfo>();
633 
634   // Do sanity check of the entries: threre should not
635   // be any successors (not in the entry set) other than
636   // {ReturnBlock, NonReturnBlock}
637   assert(OutliningInfo->Entries[0] == &F->front() &&
638          "Function Entry must be the first in Entries vector");
639   DenseSet<BasicBlock *> Entries;
640   for (BasicBlock *E : OutliningInfo->Entries)
641     Entries.insert(E);
642 
643   // Returns true of BB has Predecessor which is not
644   // in Entries set.
645   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
646     for (auto Pred : predecessors(BB)) {
647       if (!Entries.count(Pred))
648         return true;
649     }
650     return false;
651   };
652   auto CheckAndNormalizeCandidate =
653       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
654         for (BasicBlock *E : OutliningInfo->Entries) {
655           for (auto Succ : successors(E)) {
656             if (Entries.count(Succ))
657               continue;
658             if (Succ == OutliningInfo->ReturnBlock)
659               OutliningInfo->ReturnBlockPreds.push_back(E);
660             else if (Succ != OutliningInfo->NonReturnBlock)
661               return false;
662           }
663           // There should not be any outside incoming edges either:
664           if (HasNonEntryPred(E))
665             return false;
666         }
667         return true;
668       };
669 
670   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
671     return std::unique_ptr<FunctionOutliningInfo>();
672 
673   // Now further growing the candidate's inlining region by
674   // peeling off dominating blocks from the outlining region:
675   while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
676     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
677     if (succ_size(Cand) != 2)
678       break;
679 
680     if (HasNonEntryPred(Cand))
681       break;
682 
683     BasicBlock *Succ1 = *succ_begin(Cand);
684     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
685 
686     BasicBlock *ReturnBlock, *NonReturnBlock;
687     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
688     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
689       break;
690 
691     if (NonReturnBlock->getSinglePredecessor() != Cand)
692       break;
693 
694     // Now grow and update OutlininigInfo:
695     OutliningInfo->Entries.push_back(Cand);
696     OutliningInfo->NonReturnBlock = NonReturnBlock;
697     OutliningInfo->ReturnBlockPreds.push_back(Cand);
698     Entries.insert(Cand);
699   }
700 
701   return OutliningInfo;
702 }
703 
704 // Check if there is PGO data or user annoated branch data:
705 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
706   if (F->hasProfileData())
707     return true;
708   // Now check if any of the entry block has MD_prof data:
709   for (auto *E : OI->Entries) {
710     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
711     if (!BR || BR->isUnconditional())
712       continue;
713     uint64_t T, F;
714     if (BR->extractProfMetadata(T, F))
715       return true;
716   }
717   return false;
718 }
719 
720 BranchProbability
721 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
722   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
723   auto EntryFreq =
724       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
725   auto OutliningCallFreq =
726       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
727   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
728   // we outlined any regions, so we may encounter situations where the
729   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
730   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
731     OutliningCallFreq = EntryFreq;
732   }
733   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
734       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
735 
736   if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
737     return OutlineRegionRelFreq;
738 
739   // When profile data is not available, we need to be conservative in
740   // estimating the overall savings. Static branch prediction can usually
741   // guess the branch direction right (taken/non-taken), but the guessed
742   // branch probability is usually not biased enough. In case when the
743   // outlined region is predicted to be likely, its probability needs
744   // to be made higher (more biased) to not under-estimate the cost of
745   // function outlining. On the other hand, if the outlined region
746   // is predicted to be less likely, the predicted probablity is usually
747   // higher than the actual. For instance, the actual probability of the
748   // less likely target is only 5%, but the guessed probablity can be
749   // 40%. In the latter case, there is no need for further adjustement.
750   // FIXME: add an option for this.
751   if (OutlineRegionRelFreq < BranchProbability(45, 100))
752     return OutlineRegionRelFreq;
753 
754   OutlineRegionRelFreq = std::max(
755       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
756 
757   return OutlineRegionRelFreq;
758 }
759 
760 bool PartialInlinerImpl::shouldPartialInline(
761     CallSite CS, FunctionCloner &Cloner,
762     BlockFrequency WeightedOutliningRcost,
763     OptimizationRemarkEmitter &ORE) {
764   using namespace ore;
765 
766   Instruction *Call = CS.getInstruction();
767   Function *Callee = CS.getCalledFunction();
768   assert(Callee == Cloner.ClonedFunc);
769 
770   if (SkipCostAnalysis)
771     return isInlineViable(*Callee);
772 
773   Function *Caller = CS.getCaller();
774   auto &CalleeTTI = (*GetTTI)(*Callee);
775   bool RemarksEnabled =
776       Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
777           DEBUG_TYPE);
778   assert(Call && "invalid callsite for partial inline");
779   InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
780                                 CalleeTTI, *GetAssumptionCache, GetBFI, PSI,
781                                 RemarksEnabled ? &ORE : nullptr);
782 
783   if (IC.isAlways()) {
784     ORE.emit([&]() {
785       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
786              << NV("Callee", Cloner.OrigFunc)
787              << " should always be fully inlined, not partially";
788     });
789     return false;
790   }
791 
792   if (IC.isNever()) {
793     ORE.emit([&]() {
794       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
795              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
796              << NV("Caller", Caller)
797              << " because it should never be inlined (cost=never)";
798     });
799     return false;
800   }
801 
802   if (!IC) {
803     ORE.emit([&]() {
804       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
805              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
806              << NV("Caller", Caller) << " because too costly to inline (cost="
807              << NV("Cost", IC.getCost()) << ", threshold="
808              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
809     });
810     return false;
811   }
812   const DataLayout &DL = Caller->getParent()->getDataLayout();
813 
814   // The savings of eliminating the call:
815   int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
816   BlockFrequency NormWeightedSavings(NonWeightedSavings);
817 
818   // Weighted saving is smaller than weighted cost, return false
819   if (NormWeightedSavings < WeightedOutliningRcost) {
820     ORE.emit([&]() {
821       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
822                                         Call)
823              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
824              << NV("Caller", Caller) << " runtime overhead (overhead="
825              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
826              << ", savings="
827              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
828              << ")"
829              << " of making the outlined call is too high";
830     });
831 
832     return false;
833   }
834 
835   ORE.emit([&]() {
836     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
837            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
838            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
839            << " (threshold="
840            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
841   });
842   return true;
843 }
844 
845 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
846 // For now use a simplified version. The returned 'InlineCost' will be used
847 // to esimate the size cost as well as runtime cost of the BB.
848 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
849   int InlineCost = 0;
850   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
851   for (Instruction &I : BB->instructionsWithoutDebug()) {
852     // Skip free instructions.
853     switch (I.getOpcode()) {
854     case Instruction::BitCast:
855     case Instruction::PtrToInt:
856     case Instruction::IntToPtr:
857     case Instruction::Alloca:
858     case Instruction::PHI:
859       continue;
860     case Instruction::GetElementPtr:
861       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
862         continue;
863       break;
864     default:
865       break;
866     }
867 
868     if (I.isLifetimeStartOrEnd())
869       continue;
870 
871     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
872       InlineCost += getCallsiteCost(*CI, DL);
873       continue;
874     }
875 
876     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
877       InlineCost += getCallsiteCost(*II, DL);
878       continue;
879     }
880 
881     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
882       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
883       continue;
884     }
885     InlineCost += InlineConstants::InstrCost;
886   }
887   return InlineCost;
888 }
889 
890 std::tuple<int, int>
891 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
892   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
893   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
894     Function *OutlinedFunc = FuncBBPair.first;
895     BasicBlock* OutliningCallBB = FuncBBPair.second;
896     // Now compute the cost of the call sequence to the outlined function
897     // 'OutlinedFunction' in BB 'OutliningCallBB':
898     OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
899 
900     // Now compute the cost of the extracted/outlined function itself:
901     for (BasicBlock &BB : *OutlinedFunc)
902       OutlinedFunctionCost += computeBBInlineCost(&BB);
903   }
904   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
905          "Outlined function cost should be no less than the outlined region");
906 
907   // The code extractor introduces a new root and exit stub blocks with
908   // additional unconditional branches. Those branches will be eliminated
909   // later with bb layout. The cost should be adjusted accordingly:
910   OutlinedFunctionCost -=
911       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
912 
913   int OutliningRuntimeOverhead =
914       OutliningFuncCallCost +
915       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
916       ExtraOutliningPenalty;
917 
918   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
919 }
920 
921 // Create the callsite to profile count map which is
922 // used to update the original function's entry count,
923 // after the function is partially inlined into the callsite.
924 void PartialInlinerImpl::computeCallsiteToProfCountMap(
925     Function *DuplicateFunction,
926     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
927   std::vector<User *> Users(DuplicateFunction->user_begin(),
928                             DuplicateFunction->user_end());
929   Function *CurrentCaller = nullptr;
930   std::unique_ptr<BlockFrequencyInfo> TempBFI;
931   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
932 
933   auto ComputeCurrBFI = [&,this](Function *Caller) {
934       // For the old pass manager:
935       if (!GetBFI) {
936         DominatorTree DT(*Caller);
937         LoopInfo LI(DT);
938         BranchProbabilityInfo BPI(*Caller, LI);
939         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
940         CurrentCallerBFI = TempBFI.get();
941       } else {
942         // New pass manager:
943         CurrentCallerBFI = &(*GetBFI)(*Caller);
944       }
945   };
946 
947   for (User *User : Users) {
948     CallSite CS = getCallSite(User);
949     Function *Caller = CS.getCaller();
950     if (CurrentCaller != Caller) {
951       CurrentCaller = Caller;
952       ComputeCurrBFI(Caller);
953     } else {
954       assert(CurrentCallerBFI && "CallerBFI is not set");
955     }
956     BasicBlock *CallBB = CS.getInstruction()->getParent();
957     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
958     if (Count)
959       CallSiteToProfCountMap[User] = *Count;
960     else
961       CallSiteToProfCountMap[User] = 0;
962   }
963 }
964 
965 PartialInlinerImpl::FunctionCloner::FunctionCloner(
966     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
967     function_ref<AssumptionCache *(Function &)> LookupAC)
968     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
969   ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
970 
971   // Clone the function, so that we can hack away on it.
972   ValueToValueMapTy VMap;
973   ClonedFunc = CloneFunction(F, VMap);
974 
975   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
976   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
977   for (BasicBlock *BB : OI->Entries) {
978     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
979   }
980   for (BasicBlock *E : OI->ReturnBlockPreds) {
981     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
982     ClonedOI->ReturnBlockPreds.push_back(NewE);
983   }
984   // Go ahead and update all uses to the duplicate, so that we can just
985   // use the inliner functionality when we're done hacking.
986   F->replaceAllUsesWith(ClonedFunc);
987 }
988 
989 PartialInlinerImpl::FunctionCloner::FunctionCloner(
990     Function *F, FunctionOutliningMultiRegionInfo *OI,
991     OptimizationRemarkEmitter &ORE,
992     function_ref<AssumptionCache *(Function &)> LookupAC)
993     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
994   ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();
995 
996   // Clone the function, so that we can hack away on it.
997   ValueToValueMapTy VMap;
998   ClonedFunc = CloneFunction(F, VMap);
999 
1000   // Go through all Outline Candidate Regions and update all BasicBlock
1001   // information.
1002   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1003        OI->ORI) {
1004     SmallVector<BasicBlock *, 8> Region;
1005     for (BasicBlock *BB : RegionInfo.Region) {
1006       Region.push_back(cast<BasicBlock>(VMap[BB]));
1007     }
1008     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1009     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1010     BasicBlock *NewReturnBlock = nullptr;
1011     if (RegionInfo.ReturnBlock)
1012       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1013     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1014         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1015     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1016   }
1017   // Go ahead and update all uses to the duplicate, so that we can just
1018   // use the inliner functionality when we're done hacking.
1019   F->replaceAllUsesWith(ClonedFunc);
1020 }
1021 
1022 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1023   auto getFirstPHI = [](BasicBlock *BB) {
1024     BasicBlock::iterator I = BB->begin();
1025     PHINode *FirstPhi = nullptr;
1026     while (I != BB->end()) {
1027       PHINode *Phi = dyn_cast<PHINode>(I);
1028       if (!Phi)
1029         break;
1030       if (!FirstPhi) {
1031         FirstPhi = Phi;
1032         break;
1033       }
1034     }
1035     return FirstPhi;
1036   };
1037 
1038   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1039   // blocks.
1040   if (!ClonedOI)
1041     return;
1042 
1043   // Special hackery is needed with PHI nodes that have inputs from more than
1044   // one extracted block.  For simplicity, just split the PHIs into a two-level
1045   // sequence of PHIs, some of which will go in the extracted region, and some
1046   // of which will go outside.
1047   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1048   // only split block when necessary:
1049   PHINode *FirstPhi = getFirstPHI(PreReturn);
1050   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1051 
1052   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1053     return;
1054 
1055   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1056     Value *CommonValue = PN->getIncomingValue(0);
1057     if (all_of(PN->incoming_values(),
1058                [&](Value *V) { return V == CommonValue; }))
1059       return CommonValue;
1060     return nullptr;
1061   };
1062 
1063   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1064       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1065   BasicBlock::iterator I = PreReturn->begin();
1066   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1067   SmallVector<Instruction *, 4> DeadPhis;
1068   while (I != PreReturn->end()) {
1069     PHINode *OldPhi = dyn_cast<PHINode>(I);
1070     if (!OldPhi)
1071       break;
1072 
1073     PHINode *RetPhi =
1074         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1075     OldPhi->replaceAllUsesWith(RetPhi);
1076     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1077 
1078     RetPhi->addIncoming(&*I, PreReturn);
1079     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1080       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1081       OldPhi->removeIncomingValue(E);
1082     }
1083 
1084     // After incoming values splitting, the old phi may become trivial.
1085     // Keeping the trivial phi can introduce definition inside the outline
1086     // region which is live-out, causing necessary overhead (load, store
1087     // arg passing etc).
1088     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1089       OldPhi->replaceAllUsesWith(OldPhiVal);
1090       DeadPhis.push_back(OldPhi);
1091     }
1092     ++I;
1093   }
1094   for (auto *DP : DeadPhis)
1095     DP->eraseFromParent();
1096 
1097   for (auto E : ClonedOI->ReturnBlockPreds) {
1098     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1099   }
1100 }
1101 
1102 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1103 
1104   auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1105     int Cost = 0;
1106     for (BasicBlock* BB : Region)
1107       Cost += computeBBInlineCost(BB);
1108     return Cost;
1109   };
1110 
1111   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1112 
1113   if (ClonedOMRI->ORI.empty())
1114     return false;
1115 
1116   // The CodeExtractor needs a dominator tree.
1117   DominatorTree DT;
1118   DT.recalculate(*ClonedFunc);
1119 
1120   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1121   LoopInfo LI(DT);
1122   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1123   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1124 
1125   SetVector<Value *> Inputs, Outputs, Sinks;
1126   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1127        ClonedOMRI->ORI) {
1128     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1129 
1130     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1131                      ClonedFuncBFI.get(), &BPI,
1132                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1133                      /* AllowVarargs */ false);
1134 
1135     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1136 
1137 #ifndef NDEBUG
1138     if (TracePartialInlining) {
1139       dbgs() << "inputs: " << Inputs.size() << "\n";
1140       dbgs() << "outputs: " << Outputs.size() << "\n";
1141       for (Value *value : Inputs)
1142         dbgs() << "value used in func: " << *value << "\n";
1143       for (Value *output : Outputs)
1144         dbgs() << "instr used in func: " << *output << "\n";
1145     }
1146 #endif
1147     // Do not extract regions that have live exit variables.
1148     if (Outputs.size() > 0 && !ForceLiveExit)
1149       continue;
1150 
1151     Function *OutlinedFunc = CE.extractCodeRegion();
1152 
1153     if (OutlinedFunc) {
1154       CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1155       BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1156       assert(OutliningCallBB->getParent() == ClonedFunc);
1157       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1158       NumColdRegionsOutlined++;
1159       OutlinedRegionCost += CurrentOutlinedRegionCost;
1160 
1161       if (MarkOutlinedColdCC) {
1162         OutlinedFunc->setCallingConv(CallingConv::Cold);
1163         OCS.setCallingConv(CallingConv::Cold);
1164       }
1165     } else
1166       ORE.emit([&]() {
1167         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1168                                         &RegionInfo.Region.front()->front())
1169                << "Failed to extract region at block "
1170                << ore::NV("Block", RegionInfo.Region.front());
1171       });
1172   }
1173 
1174   return !OutlinedFunctions.empty();
1175 }
1176 
1177 Function *
1178 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1179   // Returns true if the block is to be partial inlined into the caller
1180   // (i.e. not to be extracted to the out of line function)
1181   auto ToBeInlined = [&, this](BasicBlock *BB) {
1182     return BB == ClonedOI->ReturnBlock ||
1183            (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1184             ClonedOI->Entries.end());
1185   };
1186 
1187   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1188   // The CodeExtractor needs a dominator tree.
1189   DominatorTree DT;
1190   DT.recalculate(*ClonedFunc);
1191 
1192   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1193   LoopInfo LI(DT);
1194   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1195   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1196 
1197   // Gather up the blocks that we're going to extract.
1198   std::vector<BasicBlock *> ToExtract;
1199   ToExtract.push_back(ClonedOI->NonReturnBlock);
1200   OutlinedRegionCost +=
1201       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1202   for (BasicBlock &BB : *ClonedFunc)
1203     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1204       ToExtract.push_back(&BB);
1205       // FIXME: the code extractor may hoist/sink more code
1206       // into the outlined function which may make the outlining
1207       // overhead (the difference of the outlined function cost
1208       // and OutliningRegionCost) look larger.
1209       OutlinedRegionCost += computeBBInlineCost(&BB);
1210     }
1211 
1212   // Extract the body of the if.
1213   Function *OutlinedFunc =
1214       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1215                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1216                     /* AllowVarargs */ true)
1217           .extractCodeRegion();
1218 
1219   if (OutlinedFunc) {
1220     BasicBlock *OutliningCallBB =
1221         PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1222             .getInstruction()
1223             ->getParent();
1224     assert(OutliningCallBB->getParent() == ClonedFunc);
1225     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1226   } else
1227     ORE.emit([&]() {
1228       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1229                                       &ToExtract.front()->front())
1230              << "Failed to extract region at block "
1231              << ore::NV("Block", ToExtract.front());
1232     });
1233 
1234   return OutlinedFunc;
1235 }
1236 
1237 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1238   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1239   // users (function pointers, etc.) back to the original function.
1240   ClonedFunc->replaceAllUsesWith(OrigFunc);
1241   ClonedFunc->eraseFromParent();
1242   if (!IsFunctionInlined) {
1243     // Remove each function that was speculatively created if there is no
1244     // reference.
1245     for (auto FuncBBPair : OutlinedFunctions) {
1246       Function *Func = FuncBBPair.first;
1247       Func->eraseFromParent();
1248     }
1249   }
1250 }
1251 
1252 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1253 
1254   if (F->hasAddressTaken())
1255     return {false, nullptr};
1256 
1257   // Let inliner handle it
1258   if (F->hasFnAttribute(Attribute::AlwaysInline))
1259     return {false, nullptr};
1260 
1261   if (F->hasFnAttribute(Attribute::NoInline))
1262     return {false, nullptr};
1263 
1264   if (PSI->isFunctionEntryCold(F))
1265     return {false, nullptr};
1266 
1267   if (empty(F->users()))
1268     return {false, nullptr};
1269 
1270   OptimizationRemarkEmitter ORE(F);
1271 
1272   // Only try to outline cold regions if we have a profile summary, which
1273   // implies we have profiling information.
1274   if (PSI->hasProfileSummary() && F->hasProfileData() &&
1275       !DisableMultiRegionPartialInline) {
1276     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1277         computeOutliningColdRegionsInfo(F, ORE);
1278     if (OMRI) {
1279       FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
1280 
1281 #ifndef NDEBUG
1282       if (TracePartialInlining) {
1283         dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1284         dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1285                << "\n";
1286       }
1287 #endif
1288       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1289 
1290       if (DidOutline) {
1291 #ifndef NDEBUG
1292         if (TracePartialInlining) {
1293           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1294           Cloner.ClonedFunc->print(dbgs());
1295           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1296         }
1297 #endif
1298 
1299         if (tryPartialInline(Cloner))
1300           return {true, nullptr};
1301       }
1302     }
1303   }
1304 
1305   // Fall-thru to regular partial inlining if we:
1306   //    i) can't find any cold regions to outline, or
1307   //   ii) can't inline the outlined function anywhere.
1308   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1309   if (!OI)
1310     return {false, nullptr};
1311 
1312   FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
1313   Cloner.NormalizeReturnBlock();
1314 
1315   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1316 
1317   if (!OutlinedFunction)
1318     return {false, nullptr};
1319 
1320   bool AnyInline = tryPartialInline(Cloner);
1321 
1322   if (AnyInline)
1323     return {true, OutlinedFunction};
1324 
1325   return {false, nullptr};
1326 }
1327 
1328 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1329   if (Cloner.OutlinedFunctions.empty())
1330     return false;
1331 
1332   int SizeCost = 0;
1333   BlockFrequency WeightedRcost;
1334   int NonWeightedRcost;
1335   std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1336 
1337   // Only calculate RelativeToEntryFreq when we are doing single region
1338   // outlining.
1339   BranchProbability RelativeToEntryFreq;
1340   if (Cloner.ClonedOI) {
1341     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1342   } else
1343     // RelativeToEntryFreq doesn't make sense when we have more than one
1344     // outlined call because each call will have a different relative frequency
1345     // to the entry block.  We can consider using the average, but the
1346     // usefulness of that information is questionable. For now, assume we never
1347     // execute the calls to outlined functions.
1348     RelativeToEntryFreq = BranchProbability(0, 1);
1349 
1350   WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1351 
1352   // The call sequence(s) to the outlined function(s) are larger than the sum of
1353   // the original outlined region size(s), it does not increase the chances of
1354   // inlining the function with outlining (The inliner uses the size increase to
1355   // model the cost of inlining a callee).
1356   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1357     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1358     DebugLoc DLoc;
1359     BasicBlock *Block;
1360     std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1361     OrigFuncORE.emit([&]() {
1362       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1363                                         DLoc, Block)
1364              << ore::NV("Function", Cloner.OrigFunc)
1365              << " not partially inlined into callers (Original Size = "
1366              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1367              << ", Size of call sequence to outlined function = "
1368              << ore::NV("NewSize", SizeCost) << ")";
1369     });
1370     return false;
1371   }
1372 
1373   assert(empty(Cloner.OrigFunc->users()) &&
1374          "F's users should all be replaced!");
1375 
1376   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1377                             Cloner.ClonedFunc->user_end());
1378 
1379   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1380   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1381   if (CalleeEntryCount)
1382     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1383 
1384   uint64_t CalleeEntryCountV =
1385       (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1386 
1387   bool AnyInline = false;
1388   for (User *User : Users) {
1389     CallSite CS = getCallSite(User);
1390 
1391     if (IsLimitReached())
1392       continue;
1393 
1394     OptimizationRemarkEmitter CallerORE(CS.getCaller());
1395     if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
1396       continue;
1397 
1398     // Construct remark before doing the inlining, as after successful inlining
1399     // the callsite is removed.
1400     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
1401     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1402        << ore::NV("Caller", CS.getCaller());
1403 
1404     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
1405     // We can only forward varargs when we outlined a single region, else we
1406     // bail on vararg functions.
1407     if (!InlineFunction(CS, IFI, nullptr, true,
1408                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1409                                          : nullptr)))
1410       continue;
1411 
1412     CallerORE.emit(OR);
1413 
1414     // Now update the entry count:
1415     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1416       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1417       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1418     }
1419 
1420     AnyInline = true;
1421     NumPartialInlining++;
1422     // Update the stats
1423     if (Cloner.ClonedOI)
1424       NumPartialInlined++;
1425     else
1426       NumColdOutlinePartialInlined++;
1427 
1428   }
1429 
1430   if (AnyInline) {
1431     Cloner.IsFunctionInlined = true;
1432     if (CalleeEntryCount)
1433       Cloner.OrigFunc->setEntryCount(
1434           CalleeEntryCount.setCount(CalleeEntryCountV));
1435     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1436     OrigFuncORE.emit([&]() {
1437       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1438              << "Partially inlined into at least one caller";
1439     });
1440 
1441   }
1442 
1443   return AnyInline;
1444 }
1445 
1446 bool PartialInlinerImpl::run(Module &M) {
1447   if (DisablePartialInlining)
1448     return false;
1449 
1450   std::vector<Function *> Worklist;
1451   Worklist.reserve(M.size());
1452   for (Function &F : M)
1453     if (!F.use_empty() && !F.isDeclaration())
1454       Worklist.push_back(&F);
1455 
1456   bool Changed = false;
1457   while (!Worklist.empty()) {
1458     Function *CurrFunc = Worklist.back();
1459     Worklist.pop_back();
1460 
1461     if (CurrFunc->use_empty())
1462       continue;
1463 
1464     bool Recursive = false;
1465     for (User *U : CurrFunc->users())
1466       if (Instruction *I = dyn_cast<Instruction>(U))
1467         if (I->getParent()->getParent() == CurrFunc) {
1468           Recursive = true;
1469           break;
1470         }
1471     if (Recursive)
1472       continue;
1473 
1474     std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1475     if (Result.second)
1476       Worklist.push_back(Result.second);
1477     Changed |= Result.first;
1478   }
1479 
1480   return Changed;
1481 }
1482 
1483 char PartialInlinerLegacyPass::ID = 0;
1484 
1485 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1486                       "Partial Inliner", false, false)
1487 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1488 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1489 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1490 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1491                     "Partial Inliner", false, false)
1492 
1493 ModulePass *llvm::createPartialInliningPass() {
1494   return new PartialInlinerLegacyPass();
1495 }
1496 
1497 PreservedAnalyses PartialInlinerPass::run(Module &M,
1498                                           ModuleAnalysisManager &AM) {
1499   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1500 
1501   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1502       [&FAM](Function &F) -> AssumptionCache & {
1503     return FAM.getResult<AssumptionAnalysis>(F);
1504   };
1505 
1506   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1507     return FAM.getCachedResult<AssumptionAnalysis>(F);
1508   };
1509 
1510   std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1511       [&FAM](Function &F) -> BlockFrequencyInfo & {
1512     return FAM.getResult<BlockFrequencyAnalysis>(F);
1513   };
1514 
1515   std::function<TargetTransformInfo &(Function &)> GetTTI =
1516       [&FAM](Function &F) -> TargetTransformInfo & {
1517     return FAM.getResult<TargetIRAnalysis>(F);
1518   };
1519 
1520   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1521 
1522   if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
1523                          {GetBFI}, PSI)
1524           .run(M))
1525     return PreservedAnalyses::none();
1526   return PreservedAnalyses::all();
1527 }
1528