1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/DepthFirstIterator.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/BranchProbabilityInfo.h"
23 #include "llvm/Analysis/InlineCost.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/IR/Attributes.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/CFG.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/ProfDataUtils.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/Support/BlockFrequency.h"
46 #include "llvm/Support/BranchProbability.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Transforms/IPO.h"
51 #include "llvm/Transforms/Utils/Cloning.h"
52 #include "llvm/Transforms/Utils/CodeExtractor.h"
53 #include "llvm/Transforms/Utils/ValueMapper.h"
54 #include <algorithm>
55 #include <cassert>
56 #include <cstdint>
57 #include <memory>
58 #include <tuple>
59 #include <vector>
60
61 using namespace llvm;
62
63 #define DEBUG_TYPE "partial-inlining"
64
// Statistics reported under -stats.
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing. Note: no explicit cl::init, so it relies
// on cl::opt<bool>'s default-construction to false.
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",

                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of blocks that may remain inlined (the guarding
// entry chain plus the return block); setting it to 0 or 1 effectively
// disables single-region partial inlining.
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
144
145 namespace {
146
// Describes a single-region (early-return) outlining candidate: a chain of
// guarding "entry" blocks, starting at the function entry, that conditionally
// branch either to a common return block or deeper into the function body.
// The guarded body (NonReturnBlock and what it dominates) is outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
167
// Describes the multi-region (cold-region) outlining candidates of a
// function: each candidate region is single-entry/single-exit.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() = default;

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
          ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
    // All blocks belonging to the region (EntryBlock's dominator-tree
    // descendants at discovery time).
    SmallVector<BasicBlock *, 8> Region;
    // The region's unique entry block.
    BasicBlock *EntryBlock;
    // The block inside the region whose out-edge leaves the region.
    BasicBlock *ExitBlock;
    // The block executed after the outlined region, i.e. ExitBlock's
    // successor outside the region (may be null if there is no unique one).
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};
186
// Implementation of the partial inliner. Holds the analysis accessors handed
// in by the pass wrapper and drives candidate discovery, outlining, and the
// final inlining attempts.
struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  // Runs partial inlining over every eligible function in M; returns true if
  // the module changed.
  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function &F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void normalizeReturnBlock() const;

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    // function, we temporarily generate IR that does not properly
    // forward varargs to the outlined function. Calling InlineFunction
    // will update calls to the outlined functions to properly forward
    // the varargs.
    Function *doSingleRegionFunctionOutlining();

    // The function being partially inlined and its speculative clone.
    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    InstructionCost OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  // Number of successful partial-inline transformations so far; checked
  // against MaxNumPartialInlining by isLimitReached().
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutlininingBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability
  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE) const;

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void
  computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                DenseMap<User *, uint64_t> &SiteCountMap) const;

  // True once the per-module budget (-max-partial-inlining) is exhausted;
  // -1 means unlimited.
  bool isLimitReached() const {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Casts U to a CallBase; aborts if U is not a call or invoke.
  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

  // Returns the call site of F's first user. Assumes F has at least one user
  // and all users are calls/invokes.
  static CallBase *getOneCallSiteTo(Function &F) {
    User *User = *F.user_begin();
    return getSupportedCallBase(User);
  }

  // Returns the debug location and enclosing block of one call site to F,
  // for use in diagnostics.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the addtional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<InstructionCost, InstructionCost>
  computeOutliningCosts(FunctionCloner &Cloner) const;

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static InstructionCost computeBBInlineCost(BasicBlock *BB,
                                             TargetTransformInfo *TTI);

  std::unique_ptr<FunctionOutliningInfo>
  computeOutliningInfo(Function &F) const;

  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function &F,
                                  OptimizationRemarkEmitter &ORE) const;
};
343
344 } // end anonymous namespace
345
346 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function & F,OptimizationRemarkEmitter & ORE) const347 PartialInlinerImpl::computeOutliningColdRegionsInfo(
348 Function &F, OptimizationRemarkEmitter &ORE) const {
349 BasicBlock *EntryBlock = &F.front();
350
351 DominatorTree DT(F);
352 LoopInfo LI(DT);
353 BranchProbabilityInfo BPI(F, LI);
354 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
355 BlockFrequencyInfo *BFI;
356 if (!GetBFI) {
357 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
358 BFI = ScopedBFI.get();
359 } else
360 BFI = &(GetBFI(F));
361
362 // Return if we don't have profiling information.
363 if (!PSI.hasInstrumentationProfile())
364 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
365
366 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
367 std::make_unique<FunctionOutliningMultiRegionInfo>();
368
369 auto IsSingleExit =
370 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
371 BasicBlock *ExitBlock = nullptr;
372 for (auto *Block : BlockList) {
373 for (BasicBlock *Succ : successors(Block)) {
374 if (!is_contained(BlockList, Succ)) {
375 if (ExitBlock) {
376 ORE.emit([&]() {
377 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
378 &Succ->front())
379 << "Region dominated by "
380 << ore::NV("Block", BlockList.front()->getName())
381 << " has more than one region exit edge.";
382 });
383 return nullptr;
384 }
385
386 ExitBlock = Block;
387 }
388 }
389 }
390 return ExitBlock;
391 };
392
393 auto BBProfileCount = [BFI](BasicBlock *BB) {
394 return BFI->getBlockProfileCount(BB).value_or(0);
395 };
396
397 // Use the same computeBBInlineCost function to compute the cost savings of
398 // the outlining the candidate region.
399 TargetTransformInfo *FTTI = &GetTTI(F);
400 InstructionCost OverallFunctionCost = 0;
401 for (auto &BB : F)
402 OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
403
404 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
405 << "\n";);
406
407 InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
408 [&](auto Cost) { return Cost * MinRegionSizeRatio; });
409
410 BranchProbability MinBranchProbability(
411 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
412 MinBlockCounterExecution);
413 bool ColdCandidateFound = false;
414 BasicBlock *CurrEntry = EntryBlock;
415 std::vector<BasicBlock *> DFS;
416 DenseMap<BasicBlock *, bool> VisitedMap;
417 DFS.push_back(CurrEntry);
418 VisitedMap[CurrEntry] = true;
419
420 // Use Depth First Search on the basic blocks to find CFG edges that are
421 // considered cold.
422 // Cold regions considered must also have its inline cost compared to the
423 // overall inline cost of the original function. The region is outlined only
424 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
425 // more.
426 while (!DFS.empty()) {
427 auto *ThisBB = DFS.back();
428 DFS.pop_back();
429 // Only consider regions with predecessor blocks that are considered
430 // not-cold (default: part of the top 99.99% of all block counters)
431 // AND greater than our minimum block execution count (default: 100).
432 if (PSI.isColdBlock(ThisBB, BFI) ||
433 BBProfileCount(ThisBB) < MinBlockCounterExecution)
434 continue;
435 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
436 if (VisitedMap[*SI])
437 continue;
438 VisitedMap[*SI] = true;
439 DFS.push_back(*SI);
440 // If branch isn't cold, we skip to the next one.
441 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
442 if (SuccProb > MinBranchProbability)
443 continue;
444
445 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
446 << SI->getName()
447 << "\nBranch Probability = " << SuccProb << "\n";);
448
449 SmallVector<BasicBlock *, 8> DominateVector;
450 DT.getDescendants(*SI, DominateVector);
451 assert(!DominateVector.empty() &&
452 "SI should be reachable and have at least itself as descendant");
453
454 // We can only outline single entry regions (for now).
455 if (!DominateVector.front()->hasNPredecessors(1)) {
456 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
457 << " doesn't have a single predecessor in the "
458 "dominator tree\n";);
459 continue;
460 }
461
462 BasicBlock *ExitBlock = nullptr;
463 // We can only outline single exit regions (for now).
464 if (!(ExitBlock = IsSingleExit(DominateVector))) {
465 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
466 << " doesn't have a unique successor\n";);
467 continue;
468 }
469
470 InstructionCost OutlineRegionCost = 0;
471 for (auto *BB : DominateVector)
472 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
473
474 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
475 << "\n";);
476
477 if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
478 ORE.emit([&]() {
479 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
480 &SI->front())
481 << ore::NV("Callee", &F)
482 << " inline cost-savings smaller than "
483 << ore::NV("Cost", MinOutlineRegionCost);
484 });
485
486 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
487 << MinOutlineRegionCost << "\n";);
488 continue;
489 }
490
491 // For now, ignore blocks that belong to a SISE region that is a
492 // candidate for outlining. In the future, we may want to look
493 // at inner regions because the outer region may have live-exit
494 // variables.
495 for (auto *BB : DominateVector)
496 VisitedMap[BB] = true;
497
498 // ReturnBlock here means the block after the outline call
499 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
500 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
501 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
502 OutliningInfo->ORI.push_back(RegInfo);
503 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
504 << DominateVector.front()->getName() << "\n";);
505 ColdCandidateFound = true;
506 NumColdRegionsFound++;
507 }
508 }
509
510 if (ColdCandidateFound)
511 return OutliningInfo;
512
513 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
514 }
515
// Detects the single-region (early-return) outlining pattern: a chain of
// guarding entry blocks starting at the function entry, each ending in a
// conditional branch, where one side eventually reaches a return block and
// the other side (NonReturnBlock) dominates the region to be outlined.
// Returns null if the function does not match the pattern.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
  BasicBlock *EntryBlock = &F.front();
  // The entry must end in a conditional branch for the pattern to apply.
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the two successors so a return block (if any) comes first;
  // yields {nullptr, nullptr} when neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape: one successor jumps directly to the other.
  // Yields {common successor, other successor} or {nullptr, nullptr}.
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk down the chain of guarding entry blocks until we find the block
  // whose conditional branch targets a return block.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      // Found the guarded early return: record the candidate and stop.
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    // Triangle shape: keep walking through the non-common successor.
    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // There should not be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true of BB has Predecessor which is not
  // in Entries set.
  // NOTE: Entries is captured by value, so this lambda (and the one below)
  // tests against a snapshot of the set taken here; blocks inserted into the
  // local Entries set later, in the peeling loop below, are not visible to
  // it.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the entry chain is well-formed (every entry's successors are
  // either other entries, ReturnBlock, or NonReturnBlock, and no entry has an
  // outside predecessor) while collecting ReturnBlock's entry predecessors.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    // Only peel when Cand also branches to the same return block and the
    // remaining body is reached solely through Cand.
    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutlininigInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
662
663 // Check if there is PGO data or user annotated branch data:
hasProfileData(const Function & F,const FunctionOutliningInfo & OI)664 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
665 if (F.hasProfileData())
666 return true;
667 // Now check if any of the entry block has MD_prof data:
668 for (auto *E : OI.Entries) {
669 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
670 if (!BR || BR->isUnconditional())
671 continue;
672 if (hasBranchWeightMD(*BR))
673 return true;
674 }
675 return false;
676 }
677
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner) const678 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
679 FunctionCloner &Cloner) const {
680 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
681 auto EntryFreq =
682 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
683 auto OutliningCallFreq =
684 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
685 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
686 // we outlined any regions, so we may encounter situations where the
687 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
688 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
689 OutliningCallFreq = EntryFreq;
690
691 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
692 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
693
694 if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
695 return OutlineRegionRelFreq;
696
697 // When profile data is not available, we need to be conservative in
698 // estimating the overall savings. Static branch prediction can usually
699 // guess the branch direction right (taken/non-taken), but the guessed
700 // branch probability is usually not biased enough. In case when the
701 // outlined region is predicted to be likely, its probability needs
702 // to be made higher (more biased) to not under-estimate the cost of
703 // function outlining. On the other hand, if the outlined region
704 // is predicted to be less likely, the predicted probablity is usually
705 // higher than the actual. For instance, the actual probability of the
706 // less likely target is only 5%, but the guessed probablity can be
707 // 40%. In the latter case, there is no need for further adjustment.
708 // FIXME: add an option for this.
709 if (OutlineRegionRelFreq < BranchProbability(45, 100))
710 return OutlineRegionRelFreq;
711
712 OutlineRegionRelFreq = std::max(
713 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
714
715 return OutlineRegionRelFreq;
716 }
717
shouldPartialInline(CallBase & CB,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE) const718 bool PartialInlinerImpl::shouldPartialInline(
719 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
720 OptimizationRemarkEmitter &ORE) const {
721 using namespace ore;
722
723 Function *Callee = CB.getCalledFunction();
724 assert(Callee == Cloner.ClonedFunc);
725
726 if (SkipCostAnalysis)
727 return isInlineViable(*Callee).isSuccess();
728
729 Function *Caller = CB.getCaller();
730 auto &CalleeTTI = GetTTI(*Callee);
731 bool RemarksEnabled =
732 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
733 DEBUG_TYPE);
734 InlineCost IC =
735 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
736 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
737
738 if (IC.isAlways()) {
739 ORE.emit([&]() {
740 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
741 << NV("Callee", Cloner.OrigFunc)
742 << " should always be fully inlined, not partially";
743 });
744 return false;
745 }
746
747 if (IC.isNever()) {
748 ORE.emit([&]() {
749 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
750 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
751 << NV("Caller", Caller)
752 << " because it should never be inlined (cost=never)";
753 });
754 return false;
755 }
756
757 if (!IC) {
758 ORE.emit([&]() {
759 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
760 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
761 << NV("Caller", Caller) << " because too costly to inline (cost="
762 << NV("Cost", IC.getCost()) << ", threshold="
763 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
764 });
765 return false;
766 }
767 const DataLayout &DL = Caller->getDataLayout();
768
769 // The savings of eliminating the call:
770 int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
771 BlockFrequency NormWeightedSavings(NonWeightedSavings);
772
773 // Weighted saving is smaller than weighted cost, return false
774 if (NormWeightedSavings < WeightedOutliningRcost) {
775 ORE.emit([&]() {
776 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
777 &CB)
778 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
779 << NV("Caller", Caller) << " runtime overhead (overhead="
780 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
781 << ", savings="
782 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
783 << ")"
784 << " of making the outlined call is too high";
785 });
786
787 return false;
788 }
789
790 ORE.emit([&]() {
791 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
792 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
793 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
794 << " (threshold="
795 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
796 });
797 return true;
798 }
799
800 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
801 // For now use a simplified version. The returned 'InlineCost' will be used
802 // to esimate the size cost as well as runtime cost of the BB.
803 InstructionCost
computeBBInlineCost(BasicBlock * BB,TargetTransformInfo * TTI)804 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
805 TargetTransformInfo *TTI) {
806 InstructionCost InlineCost = 0;
807 const DataLayout &DL = BB->getDataLayout();
808 int InstrCost = InlineConstants::getInstrCost();
809 for (Instruction &I : BB->instructionsWithoutDebug()) {
810 // Skip free instructions.
811 switch (I.getOpcode()) {
812 case Instruction::BitCast:
813 case Instruction::PtrToInt:
814 case Instruction::IntToPtr:
815 case Instruction::Alloca:
816 case Instruction::PHI:
817 continue;
818 case Instruction::GetElementPtr:
819 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
820 continue;
821 break;
822 default:
823 break;
824 }
825
826 if (I.isLifetimeStartOrEnd())
827 continue;
828
829 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
830 Intrinsic::ID IID = II->getIntrinsicID();
831 SmallVector<Type *, 4> Tys;
832 FastMathFlags FMF;
833 for (Value *Val : II->args())
834 Tys.push_back(Val->getType());
835
836 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
837 FMF = FPMO->getFastMathFlags();
838
839 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
840 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
841 continue;
842 }
843
844 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
845 InlineCost += getCallsiteCost(*TTI, *CI, DL);
846 continue;
847 }
848
849 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
850 InlineCost += getCallsiteCost(*TTI, *II, DL);
851 continue;
852 }
853
854 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
855 InlineCost += (SI->getNumCases() + 1) * InstrCost;
856 continue;
857 }
858 InlineCost += InstrCost;
859 }
860
861 return InlineCost;
862 }
863
864 std::tuple<InstructionCost, InstructionCost>
computeOutliningCosts(FunctionCloner & Cloner) const865 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
866 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
867 for (auto FuncBBPair : Cloner.OutlinedFunctions) {
868 Function *OutlinedFunc = FuncBBPair.first;
869 BasicBlock* OutliningCallBB = FuncBBPair.second;
870 // Now compute the cost of the call sequence to the outlined function
871 // 'OutlinedFunction' in BB 'OutliningCallBB':
872 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
873 OutliningFuncCallCost +=
874 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
875
876 // Now compute the cost of the extracted/outlined function itself:
877 for (BasicBlock &BB : *OutlinedFunc)
878 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
879 }
880 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
881 "Outlined function cost should be no less than the outlined region");
882
883 // The code extractor introduces a new root and exit stub blocks with
884 // additional unconditional branches. Those branches will be eliminated
885 // later with bb layout. The cost should be adjusted accordingly:
886 OutlinedFunctionCost -=
887 2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
888
889 InstructionCost OutliningRuntimeOverhead =
890 OutliningFuncCallCost +
891 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
892 ExtraOutliningPenalty.getValue();
893
894 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
895 }
896
897 // Create the callsite to profile count map which is
898 // used to update the original function's entry count,
899 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap) const900 void PartialInlinerImpl::computeCallsiteToProfCountMap(
901 Function *DuplicateFunction,
902 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
903 std::vector<User *> Users(DuplicateFunction->user_begin(),
904 DuplicateFunction->user_end());
905 Function *CurrentCaller = nullptr;
906 std::unique_ptr<BlockFrequencyInfo> TempBFI;
907 BlockFrequencyInfo *CurrentCallerBFI = nullptr;
908
909 auto ComputeCurrBFI = [&,this](Function *Caller) {
910 // For the old pass manager:
911 if (!GetBFI) {
912 DominatorTree DT(*Caller);
913 LoopInfo LI(DT);
914 BranchProbabilityInfo BPI(*Caller, LI);
915 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
916 CurrentCallerBFI = TempBFI.get();
917 } else {
918 // New pass manager:
919 CurrentCallerBFI = &(GetBFI(*Caller));
920 }
921 };
922
923 for (User *User : Users) {
924 // Don't bother with BlockAddress used by CallBr for asm goto.
925 if (isa<BlockAddress>(User))
926 continue;
927 CallBase *CB = getSupportedCallBase(User);
928 Function *Caller = CB->getCaller();
929 if (CurrentCaller != Caller) {
930 CurrentCaller = Caller;
931 ComputeCurrBFI(Caller);
932 } else {
933 assert(CurrentCallerBFI && "CallerBFI is not set");
934 }
935 BasicBlock *CallBB = CB->getParent();
936 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
937 if (Count)
938 CallSiteToProfCountMap[User] = *Count;
939 else
940 CallSiteToProfCountMap[User] = 0;
941 }
942 }
943
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)944 PartialInlinerImpl::FunctionCloner::FunctionCloner(
945 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
946 function_ref<AssumptionCache *(Function &)> LookupAC,
947 function_ref<TargetTransformInfo &(Function &)> GetTTI)
948 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
949 ClonedOI = std::make_unique<FunctionOutliningInfo>();
950
951 // Clone the function, so that we can hack away on it.
952 ValueToValueMapTy VMap;
953 ClonedFunc = CloneFunction(F, VMap);
954
955 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
956 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
957 for (BasicBlock *BB : OI->Entries)
958 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
959
960 for (BasicBlock *E : OI->ReturnBlockPreds) {
961 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
962 ClonedOI->ReturnBlockPreds.push_back(NewE);
963 }
964 // Go ahead and update all uses to the duplicate, so that we can just
965 // use the inliner functionality when we're done hacking.
966 F->replaceAllUsesWith(ClonedFunc);
967 }
968
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)969 PartialInlinerImpl::FunctionCloner::FunctionCloner(
970 Function *F, FunctionOutliningMultiRegionInfo *OI,
971 OptimizationRemarkEmitter &ORE,
972 function_ref<AssumptionCache *(Function &)> LookupAC,
973 function_ref<TargetTransformInfo &(Function &)> GetTTI)
974 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
975 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
976
977 // Clone the function, so that we can hack away on it.
978 ValueToValueMapTy VMap;
979 ClonedFunc = CloneFunction(F, VMap);
980
981 // Go through all Outline Candidate Regions and update all BasicBlock
982 // information.
983 for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
984 OI->ORI) {
985 SmallVector<BasicBlock *, 8> Region;
986 for (BasicBlock *BB : RegionInfo.Region)
987 Region.push_back(cast<BasicBlock>(VMap[BB]));
988
989 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
990 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
991 BasicBlock *NewReturnBlock = nullptr;
992 if (RegionInfo.ReturnBlock)
993 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
994 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
995 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
996 ClonedOMRI->ORI.push_back(MappedRegionInfo);
997 }
998 // Go ahead and update all uses to the duplicate, so that we can just
999 // use the inliner functionality when we're done hacking.
1000 F->replaceAllUsesWith(ClonedFunc);
1001 }
1002
normalizeReturnBlock() const1003 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1004 auto GetFirstPHI = [](BasicBlock *BB) {
1005 BasicBlock::iterator I = BB->begin();
1006 PHINode *FirstPhi = nullptr;
1007 while (I != BB->end()) {
1008 PHINode *Phi = dyn_cast<PHINode>(I);
1009 if (!Phi)
1010 break;
1011 if (!FirstPhi) {
1012 FirstPhi = Phi;
1013 break;
1014 }
1015 }
1016 return FirstPhi;
1017 };
1018
1019 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1020 // blocks.
1021 if (!ClonedOI)
1022 return;
1023
1024 // Special hackery is needed with PHI nodes that have inputs from more than
1025 // one extracted block. For simplicity, just split the PHIs into a two-level
1026 // sequence of PHIs, some of which will go in the extracted region, and some
1027 // of which will go outside.
1028 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1029 // only split block when necessary:
1030 PHINode *FirstPhi = GetFirstPHI(PreReturn);
1031 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1032
1033 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1034 return;
1035
1036 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1037 if (llvm::all_equal(PN->incoming_values()))
1038 return PN->getIncomingValue(0);
1039 return nullptr;
1040 };
1041
1042 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1043 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1044 BasicBlock::iterator I = PreReturn->begin();
1045 BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
1046 SmallVector<Instruction *, 4> DeadPhis;
1047 while (I != PreReturn->end()) {
1048 PHINode *OldPhi = dyn_cast<PHINode>(I);
1049 if (!OldPhi)
1050 break;
1051
1052 PHINode *RetPhi =
1053 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
1054 RetPhi->insertBefore(Ins);
1055 OldPhi->replaceAllUsesWith(RetPhi);
1056 Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();
1057
1058 RetPhi->addIncoming(&*I, PreReturn);
1059 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1060 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1061 OldPhi->removeIncomingValue(E);
1062 }
1063
1064 // After incoming values splitting, the old phi may become trivial.
1065 // Keeping the trivial phi can introduce definition inside the outline
1066 // region which is live-out, causing necessary overhead (load, store
1067 // arg passing etc).
1068 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1069 OldPhi->replaceAllUsesWith(OldPhiVal);
1070 DeadPhis.push_back(OldPhi);
1071 }
1072 ++I;
1073 }
1074 for (auto *DP : DeadPhis)
1075 DP->eraseFromParent();
1076
1077 for (auto *E : ClonedOI->ReturnBlockPreds)
1078 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1079 }
1080
doMultiRegionFunctionOutlining()1081 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1082
1083 auto ComputeRegionCost =
1084 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1085 InstructionCost Cost = 0;
1086 for (BasicBlock* BB : Region)
1087 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1088 return Cost;
1089 };
1090
1091 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1092
1093 if (ClonedOMRI->ORI.empty())
1094 return false;
1095
1096 // The CodeExtractor needs a dominator tree.
1097 DominatorTree DT;
1098 DT.recalculate(*ClonedFunc);
1099
1100 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1101 LoopInfo LI(DT);
1102 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1103 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1104
1105 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1106 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1107
1108 SetVector<Value *> Inputs, Outputs, Sinks;
1109 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1110 ClonedOMRI->ORI) {
1111 InstructionCost CurrentOutlinedRegionCost =
1112 ComputeRegionCost(RegionInfo.Region);
1113
1114 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1115 ClonedFuncBFI.get(), &BPI,
1116 LookupAC(*RegionInfo.EntryBlock->getParent()),
1117 /* AllowVarargs */ false);
1118
1119 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1120
1121 LLVM_DEBUG({
1122 dbgs() << "inputs: " << Inputs.size() << "\n";
1123 dbgs() << "outputs: " << Outputs.size() << "\n";
1124 for (Value *value : Inputs)
1125 dbgs() << "value used in func: " << *value << "\n";
1126 for (Value *output : Outputs)
1127 dbgs() << "instr used in func: " << *output << "\n";
1128 });
1129
1130 // Do not extract regions that have live exit variables.
1131 if (Outputs.size() > 0 && !ForceLiveExit)
1132 continue;
1133
1134 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1135 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1136 BasicBlock *OutliningCallBB = OCS->getParent();
1137 assert(OutliningCallBB->getParent() == ClonedFunc);
1138 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1139 NumColdRegionsOutlined++;
1140 OutlinedRegionCost += CurrentOutlinedRegionCost;
1141
1142 if (MarkOutlinedColdCC) {
1143 OutlinedFunc->setCallingConv(CallingConv::Cold);
1144 OCS->setCallingConv(CallingConv::Cold);
1145 }
1146 } else
1147 ORE.emit([&]() {
1148 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1149 &RegionInfo.Region.front()->front())
1150 << "Failed to extract region at block "
1151 << ore::NV("Block", RegionInfo.Region.front());
1152 });
1153 }
1154
1155 return !OutlinedFunctions.empty();
1156 }
1157
1158 Function *
doSingleRegionFunctionOutlining()1159 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1160 // Returns true if the block is to be partial inlined into the caller
1161 // (i.e. not to be extracted to the out of line function)
1162 auto ToBeInlined = [&, this](BasicBlock *BB) {
1163 return BB == ClonedOI->ReturnBlock ||
1164 llvm::is_contained(ClonedOI->Entries, BB);
1165 };
1166
1167 assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1168 // The CodeExtractor needs a dominator tree.
1169 DominatorTree DT;
1170 DT.recalculate(*ClonedFunc);
1171
1172 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1173 LoopInfo LI(DT);
1174 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1175 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1176
1177 // Gather up the blocks that we're going to extract.
1178 std::vector<BasicBlock *> ToExtract;
1179 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1180 ToExtract.push_back(ClonedOI->NonReturnBlock);
1181 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1182 ClonedOI->NonReturnBlock, ClonedFuncTTI);
1183 for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
1184 if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1185 ToExtract.push_back(BB);
1186 // FIXME: the code extractor may hoist/sink more code
1187 // into the outlined function which may make the outlining
1188 // overhead (the difference of the outlined function cost
1189 // and OutliningRegionCost) look larger.
1190 OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
1191 }
1192
1193 // Extract the body of the if.
1194 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1195 Function *OutlinedFunc =
1196 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1197 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1198 /* AllowVarargs */ true)
1199 .extractCodeRegion(CEAC);
1200
1201 if (OutlinedFunc) {
1202 BasicBlock *OutliningCallBB =
1203 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1204 assert(OutliningCallBB->getParent() == ClonedFunc);
1205 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1206 } else
1207 ORE.emit([&]() {
1208 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1209 &ToExtract.front()->front())
1210 << "Failed to extract region at block "
1211 << ore::NV("Block", ToExtract.front());
1212 });
1213
1214 return OutlinedFunc;
1215 }
1216
~FunctionCloner()1217 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1218 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1219 // users (function pointers, etc.) back to the original function.
1220 ClonedFunc->replaceAllUsesWith(OrigFunc);
1221 ClonedFunc->eraseFromParent();
1222 if (!IsFunctionInlined) {
1223 // Remove each function that was speculatively created if there is no
1224 // reference.
1225 for (auto FuncBBPair : OutlinedFunctions) {
1226 Function *Func = FuncBBPair.first;
1227 Func->eraseFromParent();
1228 }
1229 }
1230 }
1231
unswitchFunction(Function & F)1232 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1233 if (F.hasAddressTaken())
1234 return {false, nullptr};
1235
1236 // Let inliner handle it
1237 if (F.hasFnAttribute(Attribute::AlwaysInline))
1238 return {false, nullptr};
1239
1240 if (F.hasFnAttribute(Attribute::NoInline))
1241 return {false, nullptr};
1242
1243 if (PSI.isFunctionEntryCold(&F))
1244 return {false, nullptr};
1245
1246 if (F.users().empty())
1247 return {false, nullptr};
1248
1249 OptimizationRemarkEmitter ORE(&F);
1250
1251 // Only try to outline cold regions if we have a profile summary, which
1252 // implies we have profiling information.
1253 if (PSI.hasProfileSummary() && F.hasProfileData() &&
1254 !DisableMultiRegionPartialInline) {
1255 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1256 computeOutliningColdRegionsInfo(F, ORE);
1257 if (OMRI) {
1258 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1259
1260 LLVM_DEBUG({
1261 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1262 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1263 << "\n";
1264 });
1265
1266 bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1267
1268 if (DidOutline) {
1269 LLVM_DEBUG({
1270 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1271 Cloner.ClonedFunc->print(dbgs());
1272 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1273 });
1274
1275 if (tryPartialInline(Cloner))
1276 return {true, nullptr};
1277 }
1278 }
1279 }
1280
1281 // Fall-thru to regular partial inlining if we:
1282 // i) can't find any cold regions to outline, or
1283 // ii) can't inline the outlined function anywhere.
1284 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1285 if (!OI)
1286 return {false, nullptr};
1287
1288 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1289 Cloner.normalizeReturnBlock();
1290
1291 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1292
1293 if (!OutlinedFunction)
1294 return {false, nullptr};
1295
1296 if (tryPartialInline(Cloner))
1297 return {true, OutlinedFunction};
1298
1299 return {false, nullptr};
1300 }
1301
// Try to inline the cloned (partially outlined) function into each of its
// call sites. Returns true if at least one call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  // Nothing was outlined, so there is nothing to partially inline.
  if (Cloner.OutlinedFunctions.empty())
    return false;

  // <size cost of the outlined call sequence(s), non-weighted runtime
  // overhead> as computed by computeOutliningCosts().
  auto OutliningCosts = computeOutliningCosts(Cloner);

  InstructionCost SizeCost = std::get<0>(OutliningCosts);
  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);

  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
         "Expected valid costs");

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  // Weight the runtime overhead by how often the outlined call is expected to
  // execute relative to function entry.
  BlockFrequency WeightedRcost =
      BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The FunctionCloner constructor redirected all of OrigFunc's uses to the
  // clone, so the original must have no users here.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users; successful inlining below mutates the use list.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  // Per-callsite profile counts, used to decrement the callee's entry count
  // as call sites get partially inlined.
  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    // Don't bother with BlockAddress used by CallBr for asm goto.
    if (isa<BlockAddress>(User))
      continue;

    CallBase *CB = getSupportedCallBase(User);

    // Stop inlining further call sites once the global limit is reached.
    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    // Mark success so the FunctionCloner destructor keeps the outlined
    // functions, and write back the reduced entry count.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
          CalleeEntryCountV, CalleeEntryCount->getType()));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}
1426
run(Module & M)1427 bool PartialInlinerImpl::run(Module &M) {
1428 if (DisablePartialInlining)
1429 return false;
1430
1431 std::vector<Function *> Worklist;
1432 Worklist.reserve(M.size());
1433 for (Function &F : M)
1434 if (!F.use_empty() && !F.isDeclaration())
1435 Worklist.push_back(&F);
1436
1437 bool Changed = false;
1438 while (!Worklist.empty()) {
1439 Function *CurrFunc = Worklist.back();
1440 Worklist.pop_back();
1441
1442 if (CurrFunc->use_empty())
1443 continue;
1444
1445 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1446 if (Result.second)
1447 Worklist.push_back(Result.second);
1448 Changed |= Result.first;
1449 }
1450
1451 return Changed;
1452 }
1453
run(Module & M,ModuleAnalysisManager & AM)1454 PreservedAnalyses PartialInlinerPass::run(Module &M,
1455 ModuleAnalysisManager &AM) {
1456 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1457
1458 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1459 return FAM.getResult<AssumptionAnalysis>(F);
1460 };
1461
1462 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1463 return FAM.getCachedResult<AssumptionAnalysis>(F);
1464 };
1465
1466 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1467 return FAM.getResult<BlockFrequencyAnalysis>(F);
1468 };
1469
1470 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1471 return FAM.getResult<TargetIRAnalysis>(F);
1472 };
1473
1474 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1475 return FAM.getResult<TargetLibraryAnalysis>(F);
1476 };
1477
1478 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1479
1480 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1481 GetTLI, PSI, GetBFI)
1482 .run(M))
1483 return PreservedAnalyses::none();
1484 return PreservedAnalyses::all();
1485 }
1486