xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/DepthFirstIterator.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/BranchProbabilityInfo.h"
23 #include "llvm/Analysis/InlineCost.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/IR/Attributes.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/CFG.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/ProfDataUtils.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/Support/BlockFrequency.h"
46 #include "llvm/Support/BranchProbability.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Transforms/IPO.h"
51 #include "llvm/Transforms/Utils/Cloning.h"
52 #include "llvm/Transforms/Utils/CodeExtractor.h"
53 #include "llvm/Transforms/Utils/ValueMapper.h"
54 #include <algorithm>
55 #include <cassert>
56 #include <cstdint>
57 #include <memory>
58 #include <tuple>
59 #include <vector>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "partial-inlining"
64 
// Pass statistics (see llvm/ADT/Statistic.h); reported with -stats.
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");
74 
// Developer command-line knobs for this pass. All are cl::Hidden (or
// cl::ReallyHidden) and intended for tuning/testing, not end users.

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
               cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing: it bypasses the inline-cost based
// profitability checks so test cases can exercise the transformation alone.
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",

                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function.  Default is set at 10%.
// i.e. if the cold region reduces the inlining cost of the original function
// by at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. i.e. a confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%.
// i.e. if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of blocks kept in the caller-side (inlined)
// portion; see FunctionOutliningInfo::getNumInlinedBlocks().
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
144 
145 namespace {
146 
// Describes the single-region (early-return) outlining shape discovered by
// PartialInlinerImpl::computeOutliningInfo(): a chain of guarding entry
// blocks that stay in the caller, plus the return block, with everything
// dominated by NonReturnBlock destined for outlining.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
167 
// Describes the set of profile-cold single-entry/single-exit regions found
// by PartialInlinerImpl::computeOutliningColdRegionsInfo(); each region is
// outlined independently.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() = default;

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region, BasicBlock *EntryBlock,
                      BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    // All blocks belonging to the region (EntryBlock dominates them).
    SmallVector<BasicBlock *, 8> Region;
    // Sole entry block of the region.
    BasicBlock *EntryBlock;
    // The unique block inside Region with an edge leaving the region.
    BasicBlock *ExitBlock;
    // The block the region branches to after the (future) outline call.
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};
185 
186 struct PartialInlinerImpl {
187 
188   PartialInlinerImpl(
189       function_ref<AssumptionCache &(Function &)> GetAC,
190       function_ref<AssumptionCache *(Function &)> LookupAC,
191       function_ref<TargetTransformInfo &(Function &)> GTTI,
192       function_ref<const TargetLibraryInfo &(Function &)> GTLI,
193       ProfileSummaryInfo &ProfSI,
194       function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
195       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
196         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
197 
198   bool run(Module &M);
199   // Main part of the transformation that calls helper functions to find
200   // outlining candidates, clone & outline the function, and attempt to
201   // partially inline the resulting function. Returns true if
202   // inlining was successful, false otherwise.  Also returns the outline
203   // function (only if we partially inlined early returns) as there is a
204   // possibility to further "peel" early return statements that were left in the
205   // outline function due to code size.
206   std::pair<bool, Function *> unswitchFunction(Function &F);
207 
208   // This class speculatively clones the function to be partial inlined.
209   // At the end of partial inlining, the remaining callsites to the cloned
210   // function that are not partially inlined will be fixed up to reference
211   // the original function, and the cloned function will be erased.
212   struct FunctionCloner {
213     // Two constructors, one for single region outlining, the other for
214     // multi-region outlining.
215     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
216                    OptimizationRemarkEmitter &ORE,
217                    function_ref<AssumptionCache *(Function &)> LookupAC,
218                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
219     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
220                    OptimizationRemarkEmitter &ORE,
221                    function_ref<AssumptionCache *(Function &)> LookupAC,
222                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
223 
224     ~FunctionCloner();
225 
226     // Prepare for function outlining: making sure there is only
227     // one incoming edge from the extracted/outlined region to
228     // the return block.
229     void normalizeReturnBlock() const;
230 
231     // Do function outlining for cold regions.
232     bool doMultiRegionFunctionOutlining();
233     // Do function outlining for region after early return block(s).
234     // NOTE: For vararg functions that do the vararg handling in the outlined
235     //       function, we temporarily generate IR that does not properly
236     //       forward varargs to the outlined function. Calling InlineFunction
237     //       will update calls to the outlined functions to properly forward
238     //       the varargs.
239     Function *doSingleRegionFunctionOutlining();
240 
241     Function *OrigFunc = nullptr;
242     Function *ClonedFunc = nullptr;
243 
244     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
245     // Keep track of Outlined Functions and the basic block they're called from.
246     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
247 
248     // ClonedFunc is inlined in one of its callers after function
249     // outlining.
250     bool IsFunctionInlined = false;
251     // The cost of the region to be outlined.
252     InstructionCost OutlinedRegionCost = 0;
253     // ClonedOI is specific to outlining non-early return blocks.
254     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
255     // ClonedOMRI is specific to outlining cold regions.
256     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
257     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
258     OptimizationRemarkEmitter &ORE;
259     function_ref<AssumptionCache *(Function &)> LookupAC;
260     function_ref<TargetTransformInfo &(Function &)> GetTTI;
261   };
262 
263 private:
264   int NumPartialInlining = 0;
265   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
266   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
267   function_ref<TargetTransformInfo &(Function &)> GetTTI;
268   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
269   function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
270   ProfileSummaryInfo &PSI;
271 
272   // Return the frequency of the OutlininingBB relative to F's entry point.
273   // The result is no larger than 1 and is represented using BP.
274   // (Note that the outlined region's 'head' block can only have incoming
275   // edges from the guarding entry blocks).
276   BranchProbability
277   getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
278 
279   // Return true if the callee of CB should be partially inlined with
280   // profit.
281   bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
282                            BlockFrequency WeightedOutliningRcost,
283                            OptimizationRemarkEmitter &ORE) const;
284 
285   // Try to inline DuplicateFunction (cloned from F with call to
286   // the OutlinedFunction into its callers. Return true
287   // if there is any successful inlining.
288   bool tryPartialInline(FunctionCloner &Cloner);
289 
290   // Compute the mapping from use site of DuplicationFunction to the enclosing
291   // BB's profile count.
292   void
293   computeCallsiteToProfCountMap(Function *DuplicateFunction,
294                                 DenseMap<User *, uint64_t> &SiteCountMap) const;
295 
296   bool isLimitReached() const {
297     return (MaxNumPartialInlining != -1 &&
298             NumPartialInlining >= MaxNumPartialInlining);
299   }
300 
301   static CallBase *getSupportedCallBase(User *U) {
302     if (isa<CallInst>(U) || isa<InvokeInst>(U))
303       return cast<CallBase>(U);
304     llvm_unreachable("All uses must be calls");
305     return nullptr;
306   }
307 
308   static CallBase *getOneCallSiteTo(Function &F) {
309     User *User = *F.user_begin();
310     return getSupportedCallBase(User);
311   }
312 
313   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
314     CallBase *CB = getOneCallSiteTo(F);
315     DebugLoc DLoc = CB->getDebugLoc();
316     BasicBlock *Block = CB->getParent();
317     return std::make_tuple(DLoc, Block);
318   }
319 
320   // Returns the costs associated with function outlining:
321   // - The first value is the non-weighted runtime cost for making the call
322   //   to the outlined function, including the addtional  setup cost in the
323   //    outlined function itself;
324   // - The second value is the estimated size of the new call sequence in
325   //   basic block Cloner.OutliningCallBB;
326   std::tuple<InstructionCost, InstructionCost>
327   computeOutliningCosts(FunctionCloner &Cloner) const;
328 
329   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
330   // approximate both the size and runtime cost (Note that in the current
331   // inline cost analysis, there is no clear distinction there either).
332   static InstructionCost computeBBInlineCost(BasicBlock *BB,
333                                              TargetTransformInfo *TTI);
334 
335   std::unique_ptr<FunctionOutliningInfo>
336   computeOutliningInfo(Function &F) const;
337 
338   std::unique_ptr<FunctionOutliningMultiRegionInfo>
339   computeOutliningColdRegionsInfo(Function &F,
340                                   OptimizationRemarkEmitter &ORE) const;
341 };
342 
343 } // end anonymous namespace
344 
345 std::unique_ptr<FunctionOutliningMultiRegionInfo>
346 PartialInlinerImpl::computeOutliningColdRegionsInfo(
347     Function &F, OptimizationRemarkEmitter &ORE) const {
348   BasicBlock *EntryBlock = &F.front();
349 
350   DominatorTree DT(F);
351   LoopInfo LI(DT);
352   BranchProbabilityInfo BPI(F, LI);
353   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
354   BlockFrequencyInfo *BFI;
355   if (!GetBFI) {
356     ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
357     BFI = ScopedBFI.get();
358   } else
359     BFI = &(GetBFI(F));
360 
361   // Return if we don't have profiling information.
362   if (!PSI.hasInstrumentationProfile())
363     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
364 
365   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
366       std::make_unique<FunctionOutliningMultiRegionInfo>();
367 
368   auto IsSingleExit =
369       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
370     BasicBlock *ExitBlock = nullptr;
371     for (auto *Block : BlockList) {
372       for (BasicBlock *Succ : successors(Block)) {
373         if (!is_contained(BlockList, Succ)) {
374           if (ExitBlock) {
375             ORE.emit([&]() {
376               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
377                                               &Succ->front())
378                      << "Region dominated by "
379                      << ore::NV("Block", BlockList.front()->getName())
380                      << " has more than one region exit edge.";
381             });
382             return nullptr;
383           }
384 
385           ExitBlock = Block;
386         }
387       }
388     }
389     return ExitBlock;
390   };
391 
392   auto BBProfileCount = [BFI](BasicBlock *BB) {
393     return BFI->getBlockProfileCount(BB).value_or(0);
394   };
395 
396   // Use the same computeBBInlineCost function to compute the cost savings of
397   // the outlining the candidate region.
398   TargetTransformInfo *FTTI = &GetTTI(F);
399   InstructionCost OverallFunctionCost = 0;
400   for (auto &BB : F)
401     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
402 
403   LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
404                     << "\n";);
405 
406   InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
407       [&](auto Cost) { return Cost * MinRegionSizeRatio; });
408 
409   BranchProbability MinBranchProbability(
410       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
411       MinBlockCounterExecution);
412   bool ColdCandidateFound = false;
413   BasicBlock *CurrEntry = EntryBlock;
414   std::vector<BasicBlock *> DFS;
415   SmallPtrSet<BasicBlock *, 8> VisitedSet;
416   DFS.push_back(CurrEntry);
417   VisitedSet.insert(CurrEntry);
418 
419   // Use Depth First Search on the basic blocks to find CFG edges that are
420   // considered cold.
421   // Cold regions considered must also have its inline cost compared to the
422   // overall inline cost of the original function.  The region is outlined only
423   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
424   // more.
425   while (!DFS.empty()) {
426     auto *ThisBB = DFS.back();
427     DFS.pop_back();
428     // Only consider regions with predecessor blocks that are considered
429     // not-cold (default: part of the top 99.99% of all block counters)
430     // AND greater than our minimum block execution count (default: 100).
431     if (PSI.isColdBlock(ThisBB, BFI) ||
432         BBProfileCount(ThisBB) < MinBlockCounterExecution)
433       continue;
434     for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
435       if (!VisitedSet.insert(*SI).second)
436         continue;
437       DFS.push_back(*SI);
438       // If branch isn't cold, we skip to the next one.
439       BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
440       if (SuccProb > MinBranchProbability)
441         continue;
442 
443       LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
444                         << SI->getName()
445                         << "\nBranch Probability = " << SuccProb << "\n";);
446 
447       SmallVector<BasicBlock *, 8> DominateVector;
448       DT.getDescendants(*SI, DominateVector);
449       assert(!DominateVector.empty() &&
450              "SI should be reachable and have at least itself as descendant");
451 
452       // We can only outline single entry regions (for now).
453       if (!DominateVector.front()->hasNPredecessors(1)) {
454         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
455                           << " doesn't have a single predecessor in the "
456                              "dominator tree\n";);
457         continue;
458       }
459 
460       BasicBlock *ExitBlock = nullptr;
461       // We can only outline single exit regions (for now).
462       if (!(ExitBlock = IsSingleExit(DominateVector))) {
463         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
464                           << " doesn't have a unique successor\n";);
465         continue;
466       }
467 
468       InstructionCost OutlineRegionCost = 0;
469       for (auto *BB : DominateVector)
470         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
471 
472       LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
473                         << "\n";);
474 
475       if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
476         ORE.emit([&]() {
477           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
478                                             &SI->front())
479                  << ore::NV("Callee", &F)
480                  << " inline cost-savings smaller than "
481                  << ore::NV("Cost", MinOutlineRegionCost);
482         });
483 
484         LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
485                           << MinOutlineRegionCost << "\n";);
486         continue;
487       }
488 
489       // For now, ignore blocks that belong to a SISE region that is a
490       // candidate for outlining.  In the future, we may want to look
491       // at inner regions because the outer region may have live-exit
492       // variables.
493       VisitedSet.insert_range(DominateVector);
494 
495       // ReturnBlock here means the block after the outline call
496       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
497       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
498           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
499       OutliningInfo->ORI.push_back(RegInfo);
500       LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
501                         << DominateVector.front()->getName() << "\n";);
502       ColdCandidateFound = true;
503       NumColdRegionsFound++;
504     }
505   }
506 
507   if (ColdCandidateFound)
508     return OutliningInfo;
509 
510   return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
511 }
512 
513 std::unique_ptr<FunctionOutliningInfo>
514 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
515   BasicBlock *EntryBlock = &F.front();
516   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
517   if (!BR || BR->isUnconditional())
518     return std::unique_ptr<FunctionOutliningInfo>();
519 
520   // Returns true if Succ is BB's successor
521   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
522     return is_contained(successors(BB), Succ);
523   };
524 
525   auto IsReturnBlock = [](BasicBlock *BB) {
526     Instruction *TI = BB->getTerminator();
527     return isa<ReturnInst>(TI);
528   };
529 
530   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
531     if (IsReturnBlock(Succ1))
532       return std::make_tuple(Succ1, Succ2);
533     if (IsReturnBlock(Succ2))
534       return std::make_tuple(Succ2, Succ1);
535 
536     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
537   };
538 
539   // Detect a triangular shape:
540   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
541     if (IsSuccessor(Succ1, Succ2))
542       return std::make_tuple(Succ1, Succ2);
543     if (IsSuccessor(Succ2, Succ1))
544       return std::make_tuple(Succ2, Succ1);
545 
546     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
547   };
548 
549   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
550       std::make_unique<FunctionOutliningInfo>();
551 
552   BasicBlock *CurrEntry = EntryBlock;
553   bool CandidateFound = false;
554   do {
555     // The number of blocks to be inlined has already reached
556     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
557     // disables partial inlining for the function.
558     if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
559       break;
560 
561     if (succ_size(CurrEntry) != 2)
562       break;
563 
564     BasicBlock *Succ1 = *succ_begin(CurrEntry);
565     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
566 
567     BasicBlock *ReturnBlock, *NonReturnBlock;
568     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
569 
570     if (ReturnBlock) {
571       OutliningInfo->Entries.push_back(CurrEntry);
572       OutliningInfo->ReturnBlock = ReturnBlock;
573       OutliningInfo->NonReturnBlock = NonReturnBlock;
574       CandidateFound = true;
575       break;
576     }
577 
578     BasicBlock *CommSucc, *OtherSucc;
579     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
580 
581     if (!CommSucc)
582       break;
583 
584     OutliningInfo->Entries.push_back(CurrEntry);
585     CurrEntry = OtherSucc;
586   } while (true);
587 
588   if (!CandidateFound)
589     return std::unique_ptr<FunctionOutliningInfo>();
590 
591   // There should not be any successors (not in the entry set) other than
592   // {ReturnBlock, NonReturnBlock}
593   assert(OutliningInfo->Entries[0] == &F.front() &&
594          "Function Entry must be the first in Entries vector");
595   DenseSet<BasicBlock *> Entries(llvm::from_range, OutliningInfo->Entries);
596 
597   // Returns true of BB has Predecessor which is not
598   // in Entries set.
599   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
600     for (auto *Pred : predecessors(BB)) {
601       if (!Entries.count(Pred))
602         return true;
603     }
604     return false;
605   };
606   auto CheckAndNormalizeCandidate =
607       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
608         for (BasicBlock *E : OutliningInfo->Entries) {
609           for (auto *Succ : successors(E)) {
610             if (Entries.count(Succ))
611               continue;
612             if (Succ == OutliningInfo->ReturnBlock)
613               OutliningInfo->ReturnBlockPreds.push_back(E);
614             else if (Succ != OutliningInfo->NonReturnBlock)
615               return false;
616           }
617           // There should not be any outside incoming edges either:
618           if (HasNonEntryPred(E))
619             return false;
620         }
621         return true;
622       };
623 
624   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
625     return std::unique_ptr<FunctionOutliningInfo>();
626 
627   // Now further growing the candidate's inlining region by
628   // peeling off dominating blocks from the outlining region:
629   while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
630     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
631     if (succ_size(Cand) != 2)
632       break;
633 
634     if (HasNonEntryPred(Cand))
635       break;
636 
637     BasicBlock *Succ1 = *succ_begin(Cand);
638     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
639 
640     BasicBlock *ReturnBlock, *NonReturnBlock;
641     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
642     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
643       break;
644 
645     if (NonReturnBlock->getSinglePredecessor() != Cand)
646       break;
647 
648     // Now grow and update OutlininigInfo:
649     OutliningInfo->Entries.push_back(Cand);
650     OutliningInfo->NonReturnBlock = NonReturnBlock;
651     OutliningInfo->ReturnBlockPreds.push_back(Cand);
652     Entries.insert(Cand);
653   }
654 
655   return OutliningInfo;
656 }
657 
658 // Check if there is PGO data or user annotated branch data:
659 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
660   if (F.hasProfileData())
661     return true;
662   // Now check if any of the entry block has MD_prof data:
663   for (auto *E : OI.Entries) {
664     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
665     if (!BR || BR->isUnconditional())
666       continue;
667     if (hasBranchWeightMD(*BR))
668       return true;
669   }
670   return false;
671 }
672 
673 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
674     FunctionCloner &Cloner) const {
675   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
676   auto EntryFreq =
677       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
678   auto OutliningCallFreq =
679       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
680   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
681   // we outlined any regions, so we may encounter situations where the
682   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
683   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
684     OutliningCallFreq = EntryFreq;
685 
686   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
687       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
688 
689   if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
690     return OutlineRegionRelFreq;
691 
692   // When profile data is not available, we need to be conservative in
693   // estimating the overall savings. Static branch prediction can usually
694   // guess the branch direction right (taken/non-taken), but the guessed
695   // branch probability is usually not biased enough. In case when the
696   // outlined region is predicted to be likely, its probability needs
697   // to be made higher (more biased) to not under-estimate the cost of
698   // function outlining. On the other hand, if the outlined region
699   // is predicted to be less likely, the predicted probablity is usually
700   // higher than the actual. For instance, the actual probability of the
701   // less likely target is only 5%, but the guessed probablity can be
702   // 40%. In the latter case, there is no need for further adjustment.
703   // FIXME: add an option for this.
704   if (OutlineRegionRelFreq < BranchProbability(45, 100))
705     return OutlineRegionRelFreq;
706 
707   OutlineRegionRelFreq = std::max(
708       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
709 
710   return OutlineRegionRelFreq;
711 }
712 
// Decide whether the cloned (partially-inlined) callee should be inlined into
// this particular call site.  Combines the regular inline-cost analysis with
// the weighted runtime overhead of calling the outlined region, and emits an
// optimization remark explaining the decision either way.
bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // With cost analysis disabled, only check that inlining is possible at all.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  // Run the standard inline-cost analysis on the cloned callee.
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  // An always-inline callee should be fully inlined by the regular inliner,
  // so partial inlining would only get in the way.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  // Cost exceeds threshold: not profitable to inline even the reduced body.
  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
794 
// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
798 InstructionCost
799 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
800                                         TargetTransformInfo *TTI) {
801   InstructionCost InlineCost = 0;
802   const DataLayout &DL = BB->getDataLayout();
803   int InstrCost = InlineConstants::getInstrCost();
804   for (Instruction &I : BB->instructionsWithoutDebug()) {
805     // Skip free instructions.
806     switch (I.getOpcode()) {
807     case Instruction::BitCast:
808     case Instruction::PtrToInt:
809     case Instruction::IntToPtr:
810     case Instruction::Alloca:
811     case Instruction::PHI:
812       continue;
813     case Instruction::GetElementPtr:
814       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
815         continue;
816       break;
817     default:
818       break;
819     }
820 
821     if (I.isLifetimeStartOrEnd())
822       continue;
823 
824     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
825       Intrinsic::ID IID = II->getIntrinsicID();
826       SmallVector<Type *, 4> Tys;
827       FastMathFlags FMF;
828       for (Value *Val : II->args())
829         Tys.push_back(Val->getType());
830 
831       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
832         FMF = FPMO->getFastMathFlags();
833 
834       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
835       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
836       continue;
837     }
838 
839     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
840       InlineCost += getCallsiteCost(*TTI, *CI, DL);
841       continue;
842     }
843 
844     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
845       InlineCost += getCallsiteCost(*TTI, *II, DL);
846       continue;
847     }
848 
849     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
850       InlineCost += (SI->getNumCases() + 1) * InstrCost;
851       continue;
852     }
853     InlineCost += InstrCost;
854   }
855 
856   return InlineCost;
857 }
858 
859 std::tuple<InstructionCost, InstructionCost>
860 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
861   InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
862   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
863     Function *OutlinedFunc = FuncBBPair.first;
864     BasicBlock* OutliningCallBB = FuncBBPair.second;
865     // Now compute the cost of the call sequence to the outlined function
866     // 'OutlinedFunction' in BB 'OutliningCallBB':
867     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
868     OutliningFuncCallCost +=
869         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
870 
871     // Now compute the cost of the extracted/outlined function itself:
872     for (BasicBlock &BB : *OutlinedFunc)
873       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
874   }
875   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
876          "Outlined function cost should be no less than the outlined region");
877 
878   // The code extractor introduces a new root and exit stub blocks with
879   // additional unconditional branches. Those branches will be eliminated
880   // later with bb layout. The cost should be adjusted accordingly:
881   OutlinedFunctionCost -=
882       2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
883 
884   InstructionCost OutliningRuntimeOverhead =
885       OutliningFuncCallCost +
886       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
887       ExtraOutliningPenalty.getValue();
888 
889   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
890 }
891 
892 // Create the callsite to profile count map which is
893 // used to update the original function's entry count,
894 // after the function is partially inlined into the callsite.
895 void PartialInlinerImpl::computeCallsiteToProfCountMap(
896     Function *DuplicateFunction,
897     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
898   std::vector<User *> Users(DuplicateFunction->user_begin(),
899                             DuplicateFunction->user_end());
900   Function *CurrentCaller = nullptr;
901   std::unique_ptr<BlockFrequencyInfo> TempBFI;
902   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
903 
904   auto ComputeCurrBFI = [&,this](Function *Caller) {
905       // For the old pass manager:
906       if (!GetBFI) {
907         DominatorTree DT(*Caller);
908         LoopInfo LI(DT);
909         BranchProbabilityInfo BPI(*Caller, LI);
910         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
911         CurrentCallerBFI = TempBFI.get();
912       } else {
913         // New pass manager:
914         CurrentCallerBFI = &(GetBFI(*Caller));
915       }
916   };
917 
918   for (User *User : Users) {
919     CallBase *CB = getSupportedCallBase(User);
920     Function *Caller = CB->getCaller();
921     if (CurrentCaller != Caller) {
922       CurrentCaller = Caller;
923       ComputeCurrBFI(Caller);
924     } else {
925       assert(CurrentCallerBFI && "CallerBFI is not set");
926     }
927     BasicBlock *CallBB = CB->getParent();
928     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
929     if (Count)
930       CallSiteToProfCountMap[User] = *Count;
931     else
932       CallSiteToProfCountMap[User] = 0;
933   }
934 }
935 
936 PartialInlinerImpl::FunctionCloner::FunctionCloner(
937     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
938     function_ref<AssumptionCache *(Function &)> LookupAC,
939     function_ref<TargetTransformInfo &(Function &)> GetTTI)
940     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
941   ClonedOI = std::make_unique<FunctionOutliningInfo>();
942 
943   // Clone the function, so that we can hack away on it.
944   ValueToValueMapTy VMap;
945   ClonedFunc = CloneFunction(F, VMap);
946 
947   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
948   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
949   for (BasicBlock *BB : OI->Entries)
950     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
951 
952   for (BasicBlock *E : OI->ReturnBlockPreds) {
953     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
954     ClonedOI->ReturnBlockPreds.push_back(NewE);
955   }
956   // Go ahead and update all uses to the duplicate, so that we can just
957   // use the inliner functionality when we're done hacking.
958   F->replaceAllUsesWith(ClonedFunc);
959 }
960 
961 PartialInlinerImpl::FunctionCloner::FunctionCloner(
962     Function *F, FunctionOutliningMultiRegionInfo *OI,
963     OptimizationRemarkEmitter &ORE,
964     function_ref<AssumptionCache *(Function &)> LookupAC,
965     function_ref<TargetTransformInfo &(Function &)> GetTTI)
966     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
967   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
968 
969   // Clone the function, so that we can hack away on it.
970   ValueToValueMapTy VMap;
971   ClonedFunc = CloneFunction(F, VMap);
972 
973   // Go through all Outline Candidate Regions and update all BasicBlock
974   // information.
975   for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
976        OI->ORI) {
977     SmallVector<BasicBlock *, 8> Region;
978     for (BasicBlock *BB : RegionInfo.Region)
979       Region.push_back(cast<BasicBlock>(VMap[BB]));
980 
981     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
982     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
983     BasicBlock *NewReturnBlock = nullptr;
984     if (RegionInfo.ReturnBlock)
985       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
986     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
987         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
988     ClonedOMRI->ORI.push_back(MappedRegionInfo);
989   }
990   // Go ahead and update all uses to the duplicate, so that we can just
991   // use the inliner functionality when we're done hacking.
992   F->replaceAllUsesWith(ClonedFunc);
993 }
994 
// Normalize PHI nodes in the cloned function's return block before code
// extraction: PHIs with inputs from more than one extracted block are split
// into a two-level sequence, part staying in the extracted region and part
// going into a new return block outside it.
void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
  // Returns the first PHI of BB, or nullptr if BB does not start with a PHI.
  // Note: the loop body unconditionally breaks on its first iteration, so
  // only the first instruction is ever inspected.
  auto GetFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = GetFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // No PHIs, or no incoming edges beyond the entry predecessors plus the
  // extracted region: nothing to normalize.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // A PHI is trivial when every incoming value is identical.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    if (llvm::all_equal(PN->incoming_values()))
      return PN->getIncomingValue(0);
    return nullptr;
  };

  // Split off everything after the PHIs into a new return block; PreReturn
  // keeps the original PHIs and stays inside the extracted region.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHIIt());
  BasicBlock::iterator I = PreReturn->begin();
  BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Mirror each old PHI with a new one in the split-off return block that
    // merges the old PHI's value with the entry-predecessor values.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
    RetPhi->insertBefore(Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();

    RetPhi->addIncoming(&*I, PreReturn);
    // Move the entry-predecessor edges from the old PHI to the new one.
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  // Erase trivial PHIs only after iteration to avoid invalidating 'I'.
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Re-route the entry predecessors to the new (split-off) return block.
  for (auto *E : ClonedOI->ReturnBlockPreds)
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
}
1072 
1073 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1074 
1075   auto ComputeRegionCost =
1076       [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1077     InstructionCost Cost = 0;
1078     for (BasicBlock* BB : Region)
1079       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1080     return Cost;
1081   };
1082 
1083   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1084 
1085   if (ClonedOMRI->ORI.empty())
1086     return false;
1087 
1088   // The CodeExtractor needs a dominator tree.
1089   DominatorTree DT;
1090   DT.recalculate(*ClonedFunc);
1091 
1092   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1093   LoopInfo LI(DT);
1094   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1095   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1096 
1097   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1098   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1099 
1100   SetVector<Value *> Inputs, Outputs, Sinks;
1101   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1102        ClonedOMRI->ORI) {
1103     InstructionCost CurrentOutlinedRegionCost =
1104         ComputeRegionCost(RegionInfo.Region);
1105 
1106     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1107                      ClonedFuncBFI.get(), &BPI,
1108                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1109                      /* AllowVarargs */ false);
1110 
1111     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1112 
1113     LLVM_DEBUG({
1114       dbgs() << "inputs: " << Inputs.size() << "\n";
1115       dbgs() << "outputs: " << Outputs.size() << "\n";
1116       for (Value *value : Inputs)
1117         dbgs() << "value used in func: " << *value << "\n";
1118       for (Value *output : Outputs)
1119         dbgs() << "instr used in func: " << *output << "\n";
1120     });
1121 
1122     // Do not extract regions that have live exit variables.
1123     if (Outputs.size() > 0 && !ForceLiveExit)
1124       continue;
1125 
1126     if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1127       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1128       BasicBlock *OutliningCallBB = OCS->getParent();
1129       assert(OutliningCallBB->getParent() == ClonedFunc);
1130       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1131       NumColdRegionsOutlined++;
1132       OutlinedRegionCost += CurrentOutlinedRegionCost;
1133 
1134       if (MarkOutlinedColdCC) {
1135         OutlinedFunc->setCallingConv(CallingConv::Cold);
1136         OCS->setCallingConv(CallingConv::Cold);
1137       }
1138     } else
1139       ORE.emit([&]() {
1140         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1141                                         &RegionInfo.Region.front()->front())
1142                << "Failed to extract region at block "
1143                << ore::NV("Block", RegionInfo.Region.front());
1144       });
1145   }
1146 
1147   return !OutlinedFunctions.empty();
1148 }
1149 
1150 Function *
1151 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1152   // Returns true if the block is to be partial inlined into the caller
1153   // (i.e. not to be extracted to the out of line function)
1154   auto ToBeInlined = [&, this](BasicBlock *BB) {
1155     return BB == ClonedOI->ReturnBlock ||
1156            llvm::is_contained(ClonedOI->Entries, BB);
1157   };
1158 
1159   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1160   // The CodeExtractor needs a dominator tree.
1161   DominatorTree DT;
1162   DT.recalculate(*ClonedFunc);
1163 
1164   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1165   LoopInfo LI(DT);
1166   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1167   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1168 
1169   // Gather up the blocks that we're going to extract.
1170   std::vector<BasicBlock *> ToExtract;
1171   auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1172   ToExtract.push_back(ClonedOI->NonReturnBlock);
1173   OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1174       ClonedOI->NonReturnBlock, ClonedFuncTTI);
1175   for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
1176     if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1177       ToExtract.push_back(BB);
1178       // FIXME: the code extractor may hoist/sink more code
1179       // into the outlined function which may make the outlining
1180       // overhead (the difference of the outlined function cost
1181       // and OutliningRegionCost) look larger.
1182       OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
1183     }
1184 
1185   // Extract the body of the if.
1186   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1187   Function *OutlinedFunc =
1188       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1189                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1190                     /* AllowVarargs */ true)
1191           .extractCodeRegion(CEAC);
1192 
1193   if (OutlinedFunc) {
1194     BasicBlock *OutliningCallBB =
1195         PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1196     assert(OutliningCallBB->getParent() == ClonedFunc);
1197     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1198   } else
1199     ORE.emit([&]() {
1200       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1201                                       &ToExtract.front()->front())
1202              << "Failed to extract region at block "
1203              << ore::NV("Block", ToExtract.front());
1204     });
1205 
1206   return OutlinedFunc;
1207 }
1208 
1209 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1210   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1211   // users (function pointers, etc.) back to the original function.
1212   ClonedFunc->replaceAllUsesWith(OrigFunc);
1213   ClonedFunc->eraseFromParent();
1214   if (!IsFunctionInlined) {
1215     // Remove each function that was speculatively created if there is no
1216     // reference.
1217     for (auto FuncBBPair : OutlinedFunctions) {
1218       Function *Func = FuncBBPair.first;
1219       Func->eraseFromParent();
1220     }
1221   }
1222 }
1223 
1224 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1225   if (F.hasAddressTaken())
1226     return {false, nullptr};
1227 
1228   // Let inliner handle it
1229   if (F.hasFnAttribute(Attribute::AlwaysInline))
1230     return {false, nullptr};
1231 
1232   if (F.hasFnAttribute(Attribute::NoInline))
1233     return {false, nullptr};
1234 
1235   if (PSI.isFunctionEntryCold(&F))
1236     return {false, nullptr};
1237 
1238   if (F.users().empty())
1239     return {false, nullptr};
1240 
1241   OptimizationRemarkEmitter ORE(&F);
1242 
1243   // Only try to outline cold regions if we have a profile summary, which
1244   // implies we have profiling information.
1245   if (PSI.hasProfileSummary() && F.hasProfileData() &&
1246       !DisableMultiRegionPartialInline) {
1247     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1248         computeOutliningColdRegionsInfo(F, ORE);
1249     if (OMRI) {
1250       FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1251 
1252       LLVM_DEBUG({
1253         dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1254         dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1255                << "\n";
1256       });
1257 
1258       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1259 
1260       if (DidOutline) {
1261         LLVM_DEBUG({
1262           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1263           Cloner.ClonedFunc->print(dbgs());
1264           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1265         });
1266 
1267         if (tryPartialInline(Cloner))
1268           return {true, nullptr};
1269       }
1270     }
1271   }
1272 
1273   // Fall-thru to regular partial inlining if we:
1274   //    i) can't find any cold regions to outline, or
1275   //   ii) can't inline the outlined function anywhere.
1276   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1277   if (!OI)
1278     return {false, nullptr};
1279 
1280   FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1281   Cloner.normalizeReturnBlock();
1282 
1283   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1284 
1285   if (!OutlinedFunction)
1286     return {false, nullptr};
1287 
1288   if (tryPartialInline(Cloner))
1289     return {true, OutlinedFunction};
1290 
1291   return {false, nullptr};
1292 }
1293 
// Attempt to inline the partially-inlined (cloned) function into each of its
// call sites.  Returns true if at least one call site was inlined; on success
// also transfers profile entry counts from the inlined call sites back to the
// original function.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  auto OutliningCosts = computeOutliningCosts(Cloner);

  // SizeCost: size of the call sequence(s) to the outlined function(s).
  // NonWeightedRcost: estimated runtime overhead of the outlined call(s).
  InstructionCost SizeCost = std::get<0>(OutliningCosts);
  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);

  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
         "Expected valid costs");

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  // Scale the runtime overhead by how often the outlined call is expected to
  // execute relative to function entry.
  BlockFrequency WeightedRcost =
      BlockFrequency(NonWeightedRcost.getValue()) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The FunctionCloner constructor redirected all users to the clone.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the user list: inlining below mutates the use list.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  // Per-call-site profile counts, used to discount the original function's
  // entry count for each call site that gets inlined.
  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    // Respect the (debugging) limit on the number of partial inlines.
    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV) {
      if (auto It = CallSiteToProfCountMap.find(User);
          It != CallSiteToProfCountMap.end()) {
        uint64_t CallSiteCount = It->second;
        CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
      }
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    // Tell the cloner to keep the speculatively created outlined functions.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
          CalleeEntryCountV, CalleeEntryCount->getType()));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}
1417 
1418 bool PartialInlinerImpl::run(Module &M) {
1419   if (DisablePartialInlining)
1420     return false;
1421 
1422   std::vector<Function *> Worklist;
1423   Worklist.reserve(M.size());
1424   for (Function &F : M)
1425     if (!F.use_empty() && !F.isDeclaration())
1426       Worklist.push_back(&F);
1427 
1428   bool Changed = false;
1429   while (!Worklist.empty()) {
1430     Function *CurrFunc = Worklist.back();
1431     Worklist.pop_back();
1432 
1433     if (CurrFunc->use_empty())
1434       continue;
1435 
1436     std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1437     if (Result.second)
1438       Worklist.push_back(Result.second);
1439     Changed |= Result.first;
1440   }
1441 
1442   return Changed;
1443 }
1444 
1445 PreservedAnalyses PartialInlinerPass::run(Module &M,
1446                                           ModuleAnalysisManager &AM) {
1447   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1448 
1449   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1450     return FAM.getResult<AssumptionAnalysis>(F);
1451   };
1452 
1453   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1454     return FAM.getCachedResult<AssumptionAnalysis>(F);
1455   };
1456 
1457   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1458     return FAM.getResult<BlockFrequencyAnalysis>(F);
1459   };
1460 
1461   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1462     return FAM.getResult<TargetIRAnalysis>(F);
1463   };
1464 
1465   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1466     return FAM.getResult<TargetLibraryAnalysis>(F);
1467   };
1468 
1469   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1470 
1471   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1472                          GetTLI, PSI, GetBFI)
1473           .run(M))
1474     return PreservedAnalyses::none();
1475   return PreservedAnalyses::all();
1476 }
1477