1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/BranchProbabilityInfo.h" 24 #include "llvm/Analysis/InlineCost.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 27 #include "llvm/Analysis/ProfileSummaryInfo.h" 28 #include "llvm/Analysis/TargetLibraryInfo.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/IR/Attributes.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/CFG.h" 33 #include "llvm/IR/CallSite.h" 34 #include "llvm/IR/DebugLoc.h" 35 #include "llvm/IR/DiagnosticInfo.h" 36 #include "llvm/IR/Dominators.h" 37 #include "llvm/IR/Function.h" 38 #include "llvm/IR/InstrTypes.h" 39 #include "llvm/IR/Instruction.h" 40 #include "llvm/IR/Instructions.h" 41 #include "llvm/IR/IntrinsicInst.h" 42 #include "llvm/IR/Intrinsics.h" 43 #include "llvm/IR/Module.h" 44 #include "llvm/IR/User.h" 45 #include "llvm/InitializePasses.h" 46 #include "llvm/Pass.h" 47 #include "llvm/Support/BlockFrequency.h" 48 #include "llvm/Support/BranchProbability.h" 49 #include "llvm/Support/Casting.h" 50 #include "llvm/Support/CommandLine.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Transforms/IPO.h" 53 #include "llvm/Transforms/Utils/Cloning.h" 54 #include "llvm/Transforms/Utils/CodeExtractor.h" 55 #include "llvm/Transforms/Utils/ValueMapper.h" 56 #include <algorithm> 57 #include <cassert> 58 #include <cstdint> 59 #include <functional> 60 #include <iterator> 61 #include <memory> 62 #include <tuple> 63 #include <vector> 64 65 using namespace llvm; 66 67 #define DEBUG_TYPE "partial-inlining" 68 69 STATISTIC(NumPartialInlined, 70 "Number of callsites functions partially inlined into."); 71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " 72 "cold outlined regions were partially " 73 "inlined into its caller(s)."); 74 STATISTIC(NumColdRegionsFound, 75 "Number of cold single entry/exit regions found."); 76 STATISTIC(NumColdRegionsOutlined, 77 "Number of cold single entry/exit regions outlined."); 78 79 // Command line option to disable partial-inlining. The default is false: 80 static cl::opt<bool> 81 DisablePartialInlining("disable-partial-inlining", cl::init(false), 82 cl::Hidden, cl::desc("Disable partial inlining")); 83 // Command line option to disable multi-region partial-inlining. The default is 84 // false: 85 static cl::opt<bool> DisableMultiRegionPartialInline( 86 "disable-mr-partial-inlining", cl::init(false), cl::Hidden, 87 cl::desc("Disable multi-region partial inlining")); 88 89 // Command line option to force outlining in regions with live exit variables. 90 // The default is false: 91 static cl::opt<bool> 92 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, 93 cl::desc("Force outline regions with live exits")); 94 95 // Command line option to enable marking outline functions with Cold Calling 96 // Convention. The default is false: 97 static cl::opt<bool> 98 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, 99 cl::desc("Mark outline function calls with ColdCC")); 100 101 #ifndef NDEBUG 102 // Command line option to debug partial-inlining. The default is none: 103 static cl::opt<bool> TracePartialInlining("trace-partial-inlining", 104 cl::init(false), cl::Hidden, 105 cl::desc("Trace partial inlining.")); 106 #endif 107 108 // This is an option used by testing: 109 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 110 cl::init(false), cl::ZeroOrMore, 111 cl::ReallyHidden, 112 cl::desc("Skip Cost Analysis")); 113 // Used to determine if a cold region is worth outlining based on 114 // its inlining cost compared to the original function. Default is set at 10%. 115 // ie. if the cold region reduces the inlining cost of the original function by 116 // at least 10%. 117 static cl::opt<float> MinRegionSizeRatio( 118 "min-region-size-ratio", cl::init(0.1), cl::Hidden, 119 cl::desc("Minimum ratio comparing relative sizes of each " 120 "outline candidate and original function")); 121 // Used to tune the minimum number of execution counts needed in the predecessor 122 // block to the cold edge. ie. confidence interval. 123 static cl::opt<unsigned> 124 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, 125 cl::desc("Minimum block executions to consider " 126 "its BranchProbabilityInfo valid")); 127 // Used to determine when an edge is considered cold. Default is set to 10%. ie. 128 // if the branch probability is 10% or less, then it is deemed as 'cold'. 129 static cl::opt<float> ColdBranchRatio( 130 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 131 cl::desc("Minimum BranchProbability to consider a region cold.")); 132 133 static cl::opt<unsigned> MaxNumInlineBlocks( 134 "max-num-inline-blocks", cl::init(5), cl::Hidden, 135 cl::desc("Max number of blocks to be partially inlined")); 136 137 // Command line option to set the maximum number of partial inlining allowed 138 // for the module. The default value of -1 means no limit. 139 static cl::opt<int> MaxNumPartialInlining( 140 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 141 cl::desc("Max number of partial inlining. The default is unlimited")); 142 143 // Used only when PGO or user annotated branch data is absent. It is 144 // the least value that is used to weigh the outline region. If BFI 145 // produces larger value, the BFI value will be used. 146 static cl::opt<int> 147 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 148 cl::Hidden, cl::ZeroOrMore, 149 cl::desc("Relative frequency of outline region to " 150 "the entry block")); 151 152 static cl::opt<unsigned> ExtraOutliningPenalty( 153 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 154 cl::desc("A debug option to add additional penalty to the computed one.")); 155 156 namespace { 157 158 struct FunctionOutliningInfo { 159 FunctionOutliningInfo() = default; 160 161 // Returns the number of blocks to be inlined including all blocks 162 // in Entries and one return block. 163 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } 164 165 // A set of blocks including the function entry that guard 166 // the region to be outlined. 167 SmallVector<BasicBlock *, 4> Entries; 168 169 // The return block that is not included in the outlined region. 170 BasicBlock *ReturnBlock = nullptr; 171 172 // The dominating block of the region to be outlined. 173 BasicBlock *NonReturnBlock = nullptr; 174 175 // The set of blocks in Entries that that are predecessors to ReturnBlock 176 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 177 }; 178 179 struct FunctionOutliningMultiRegionInfo { 180 FunctionOutliningMultiRegionInfo() 181 : ORI() {} 182 183 // Container for outline regions 184 struct OutlineRegionInfo { 185 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 186 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 187 BasicBlock *ReturnBlock) 188 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 189 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 190 SmallVector<BasicBlock *, 8> Region; 191 BasicBlock *EntryBlock; 192 BasicBlock *ExitBlock; 193 BasicBlock *ReturnBlock; 194 }; 195 196 SmallVector<OutlineRegionInfo, 4> ORI; 197 }; 198 199 struct PartialInlinerImpl { 200 201 PartialInlinerImpl( 202 std::function<AssumptionCache &(Function &)> *GetAC, 203 function_ref<AssumptionCache *(Function &)> LookupAC, 204 std::function<TargetTransformInfo &(Function &)> *GTTI, 205 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, 206 ProfileSummaryInfo *ProfSI) 207 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 208 GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} 209 210 bool run(Module &M); 211 // Main part of the transformation that calls helper functions to find 212 // outlining candidates, clone & outline the function, and attempt to 213 // partially inline the resulting function. Returns true if 214 // inlining was successful, false otherwise. Also returns the outline 215 // function (only if we partially inlined early returns) as there is a 216 // possibility to further "peel" early return statements that were left in the 217 // outline function due to code size. 218 std::pair<bool, Function *> unswitchFunction(Function *F); 219 220 // This class speculatively clones the function to be partial inlined. 221 // At the end of partial inlining, the remaining callsites to the cloned 222 // function that are not partially inlined will be fixed up to reference 223 // the original function, and the cloned function will be erased. 224 struct FunctionCloner { 225 // Two constructors, one for single region outlining, the other for 226 // multi-region outlining. 227 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 228 OptimizationRemarkEmitter &ORE, 229 function_ref<AssumptionCache *(Function &)> LookupAC); 230 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 231 OptimizationRemarkEmitter &ORE, 232 function_ref<AssumptionCache *(Function &)> LookupAC); 233 ~FunctionCloner(); 234 235 // Prepare for function outlining: making sure there is only 236 // one incoming edge from the extracted/outlined region to 237 // the return block. 238 void NormalizeReturnBlock(); 239 240 // Do function outlining for cold regions. 241 bool doMultiRegionFunctionOutlining(); 242 // Do function outlining for region after early return block(s). 243 // NOTE: For vararg functions that do the vararg handling in the outlined 244 // function, we temporarily generate IR that does not properly 245 // forward varargs to the outlined function. Calling InlineFunction 246 // will update calls to the outlined functions to properly forward 247 // the varargs. 248 Function *doSingleRegionFunctionOutlining(); 249 250 Function *OrigFunc = nullptr; 251 Function *ClonedFunc = nullptr; 252 253 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 254 // Keep track of Outlined Functions and the basic block they're called from. 255 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 256 257 // ClonedFunc is inlined in one of its callers after function 258 // outlining. 259 bool IsFunctionInlined = false; 260 // The cost of the region to be outlined. 261 int OutlinedRegionCost = 0; 262 // ClonedOI is specific to outlining non-early return blocks. 263 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 264 // ClonedOMRI is specific to outlining cold regions. 265 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 266 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 267 OptimizationRemarkEmitter &ORE; 268 function_ref<AssumptionCache *(Function &)> LookupAC; 269 }; 270 271 private: 272 int NumPartialInlining = 0; 273 std::function<AssumptionCache &(Function &)> *GetAssumptionCache; 274 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 275 std::function<TargetTransformInfo &(Function &)> *GetTTI; 276 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; 277 ProfileSummaryInfo *PSI; 278 279 // Return the frequency of the OutlininingBB relative to F's entry point. 280 // The result is no larger than 1 and is represented using BP. 281 // (Note that the outlined region's 'head' block can only have incoming 282 // edges from the guarding entry blocks). 283 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner); 284 285 // Return true if the callee of CS should be partially inlined with 286 // profit. 287 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, 288 BlockFrequency WeightedOutliningRcost, 289 OptimizationRemarkEmitter &ORE); 290 291 // Try to inline DuplicateFunction (cloned from F with call to 292 // the OutlinedFunction into its callers. Return true 293 // if there is any successful inlining. 294 bool tryPartialInline(FunctionCloner &Cloner); 295 296 // Compute the mapping from use site of DuplicationFunction to the enclosing 297 // BB's profile count. 298 void computeCallsiteToProfCountMap(Function *DuplicateFunction, 299 DenseMap<User *, uint64_t> &SiteCountMap); 300 301 bool IsLimitReached() { 302 return (MaxNumPartialInlining != -1 && 303 NumPartialInlining >= MaxNumPartialInlining); 304 } 305 306 static CallSite getCallSite(User *U) { 307 CallSite CS; 308 if (CallInst *CI = dyn_cast<CallInst>(U)) 309 CS = CallSite(CI); 310 else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) 311 CS = CallSite(II); 312 else 313 llvm_unreachable("All uses must be calls"); 314 return CS; 315 } 316 317 static CallSite getOneCallSiteTo(Function *F) { 318 User *User = *F->user_begin(); 319 return getCallSite(User); 320 } 321 322 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { 323 CallSite CS = getOneCallSiteTo(F); 324 DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); 325 BasicBlock *Block = CS.getParent(); 326 return std::make_tuple(DLoc, Block); 327 } 328 329 // Returns the costs associated with function outlining: 330 // - The first value is the non-weighted runtime cost for making the call 331 // to the outlined function, including the addtional setup cost in the 332 // outlined function itself; 333 // - The second value is the estimated size of the new call sequence in 334 // basic block Cloner.OutliningCallBB; 335 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); 336 337 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 338 // approximate both the size and runtime cost (Note that in the current 339 // inline cost analysis, there is no clear distinction there either). 340 static int computeBBInlineCost(BasicBlock *BB); 341 342 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); 343 std::unique_ptr<FunctionOutliningMultiRegionInfo> 344 computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE); 345 }; 346 347 struct PartialInlinerLegacyPass : public ModulePass { 348 static char ID; // Pass identification, replacement for typeid 349 350 PartialInlinerLegacyPass() : ModulePass(ID) { 351 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 352 } 353 354 void getAnalysisUsage(AnalysisUsage &AU) const override { 355 AU.addRequired<AssumptionCacheTracker>(); 356 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 357 AU.addRequired<TargetTransformInfoWrapperPass>(); 358 } 359 360 bool runOnModule(Module &M) override { 361 if (skipModule(M)) 362 return false; 363 364 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 365 TargetTransformInfoWrapperPass *TTIWP = 366 &getAnalysis<TargetTransformInfoWrapperPass>(); 367 ProfileSummaryInfo *PSI = 368 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 369 370 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 371 [&ACT](Function &F) -> AssumptionCache & { 372 return ACT->getAssumptionCache(F); 373 }; 374 375 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { 376 return ACT->lookupAssumptionCache(F); 377 }; 378 379 std::function<TargetTransformInfo &(Function &)> GetTTI = 380 [&TTIWP](Function &F) -> TargetTransformInfo & { 381 return TTIWP->getTTI(F); 382 }; 383 384 return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, 385 &GetTTI, NoneType::None, PSI) 386 .run(M); 387 } 388 }; 389 390 } // end anonymous namespace 391 392 std::unique_ptr<FunctionOutliningMultiRegionInfo> 393 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, 394 OptimizationRemarkEmitter &ORE) { 395 BasicBlock *EntryBlock = &F->front(); 396 397 DominatorTree DT(*F); 398 LoopInfo LI(DT); 399 BranchProbabilityInfo BPI(*F, LI); 400 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 401 BlockFrequencyInfo *BFI; 402 if (!GetBFI) { 403 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI)); 404 BFI = ScopedBFI.get(); 405 } else 406 BFI = &(*GetBFI)(*F); 407 408 // Return if we don't have profiling information. 409 if (!PSI->hasInstrumentationProfile()) 410 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 411 412 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 413 std::make_unique<FunctionOutliningMultiRegionInfo>(); 414 415 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { 416 BasicBlock *Dom = BlockList.front(); 417 return BlockList.size() > 1 && Dom->hasNPredecessors(1); 418 }; 419 420 auto IsSingleExit = 421 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 422 BasicBlock *ExitBlock = nullptr; 423 for (auto *Block : BlockList) { 424 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { 425 if (!is_contained(BlockList, *SI)) { 426 if (ExitBlock) { 427 ORE.emit([&]() { 428 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 429 &SI->front()) 430 << "Region dominated by " 431 << ore::NV("Block", BlockList.front()->getName()) 432 << " has more than one region exit edge."; 433 }); 434 return nullptr; 435 } else 436 ExitBlock = Block; 437 } 438 } 439 } 440 return ExitBlock; 441 }; 442 443 auto BBProfileCount = [BFI](BasicBlock *BB) { 444 return BFI->getBlockProfileCount(BB) 445 ? BFI->getBlockProfileCount(BB).getValue() 446 : 0; 447 }; 448 449 // Use the same computeBBInlineCost function to compute the cost savings of 450 // the outlining the candidate region. 451 int OverallFunctionCost = 0; 452 for (auto &BB : *F) 453 OverallFunctionCost += computeBBInlineCost(&BB); 454 455 #ifndef NDEBUG 456 if (TracePartialInlining) 457 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n"; 458 #endif 459 int MinOutlineRegionCost = 460 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); 461 BranchProbability MinBranchProbability( 462 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 463 MinBlockCounterExecution); 464 bool ColdCandidateFound = false; 465 BasicBlock *CurrEntry = EntryBlock; 466 std::vector<BasicBlock *> DFS; 467 DenseMap<BasicBlock *, bool> VisitedMap; 468 DFS.push_back(CurrEntry); 469 VisitedMap[CurrEntry] = true; 470 // Use Depth First Search on the basic blocks to find CFG edges that are 471 // considered cold. 472 // Cold regions considered must also have its inline cost compared to the 473 // overall inline cost of the original function. The region is outlined only 474 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 475 // more. 476 while (!DFS.empty()) { 477 auto *thisBB = DFS.back(); 478 DFS.pop_back(); 479 // Only consider regions with predecessor blocks that are considered 480 // not-cold (default: part of the top 99.99% of all block counters) 481 // AND greater than our minimum block execution count (default: 100). 482 if (PSI->isColdBlock(thisBB, BFI) || 483 BBProfileCount(thisBB) < MinBlockCounterExecution) 484 continue; 485 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { 486 if (VisitedMap[*SI]) 487 continue; 488 VisitedMap[*SI] = true; 489 DFS.push_back(*SI); 490 // If branch isn't cold, we skip to the next one. 491 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI); 492 if (SuccProb > MinBranchProbability) 493 continue; 494 #ifndef NDEBUG 495 if (TracePartialInlining) { 496 dbgs() << "Found cold edge: " << thisBB->getName() << "->" 497 << (*SI)->getName() << "\nBranch Probability = " << SuccProb 498 << "\n"; 499 } 500 #endif 501 SmallVector<BasicBlock *, 8> DominateVector; 502 DT.getDescendants(*SI, DominateVector); 503 // We can only outline single entry regions (for now). 504 if (!IsSingleEntry(DominateVector)) 505 continue; 506 BasicBlock *ExitBlock = nullptr; 507 // We can only outline single exit regions (for now). 508 if (!(ExitBlock = IsSingleExit(DominateVector))) 509 continue; 510 int OutlineRegionCost = 0; 511 for (auto *BB : DominateVector) 512 OutlineRegionCost += computeBBInlineCost(BB); 513 514 #ifndef NDEBUG 515 if (TracePartialInlining) 516 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n"; 517 #endif 518 519 if (OutlineRegionCost < MinOutlineRegionCost) { 520 ORE.emit([&]() { 521 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 522 &SI->front()) 523 << ore::NV("Callee", F) << " inline cost-savings smaller than " 524 << ore::NV("Cost", MinOutlineRegionCost); 525 }); 526 continue; 527 } 528 // For now, ignore blocks that belong to a SISE region that is a 529 // candidate for outlining. In the future, we may want to look 530 // at inner regions because the outer region may have live-exit 531 // variables. 532 for (auto *BB : DominateVector) 533 VisitedMap[BB] = true; 534 // ReturnBlock here means the block after the outline call 535 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 536 // assert(ReturnBlock && "ReturnBlock is NULL somehow!"); 537 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 538 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 539 OutliningInfo->ORI.push_back(RegInfo); 540 #ifndef NDEBUG 541 if (TracePartialInlining) { 542 dbgs() << "Found Cold Candidate starting at block: " 543 << DominateVector.front()->getName() << "\n"; 544 } 545 #endif 546 ColdCandidateFound = true; 547 NumColdRegionsFound++; 548 } 549 } 550 if (ColdCandidateFound) 551 return OutliningInfo; 552 else 553 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 554 } 555 556 std::unique_ptr<FunctionOutliningInfo> 557 PartialInlinerImpl::computeOutliningInfo(Function *F) { 558 BasicBlock *EntryBlock = &F->front(); 559 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 560 if (!BR || BR->isUnconditional()) 561 return std::unique_ptr<FunctionOutliningInfo>(); 562 563 // Returns true if Succ is BB's successor 564 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 565 return is_contained(successors(BB), Succ); 566 }; 567 568 auto IsReturnBlock = [](BasicBlock *BB) { 569 Instruction *TI = BB->getTerminator(); 570 return isa<ReturnInst>(TI); 571 }; 572 573 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 574 if (IsReturnBlock(Succ1)) 575 return std::make_tuple(Succ1, Succ2); 576 if (IsReturnBlock(Succ2)) 577 return std::make_tuple(Succ2, Succ1); 578 579 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 580 }; 581 582 // Detect a triangular shape: 583 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 584 if (IsSuccessor(Succ1, Succ2)) 585 return std::make_tuple(Succ1, Succ2); 586 if (IsSuccessor(Succ2, Succ1)) 587 return std::make_tuple(Succ2, Succ1); 588 589 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 590 }; 591 592 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 593 std::make_unique<FunctionOutliningInfo>(); 594 595 BasicBlock *CurrEntry = EntryBlock; 596 bool CandidateFound = false; 597 do { 598 // The number of blocks to be inlined has already reached 599 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 600 // disables partial inlining for the function. 601 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) 602 break; 603 604 if (succ_size(CurrEntry) != 2) 605 break; 606 607 BasicBlock *Succ1 = *succ_begin(CurrEntry); 608 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 609 610 BasicBlock *ReturnBlock, *NonReturnBlock; 611 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 612 613 if (ReturnBlock) { 614 OutliningInfo->Entries.push_back(CurrEntry); 615 OutliningInfo->ReturnBlock = ReturnBlock; 616 OutliningInfo->NonReturnBlock = NonReturnBlock; 617 CandidateFound = true; 618 break; 619 } 620 621 BasicBlock *CommSucc; 622 BasicBlock *OtherSucc; 623 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 624 625 if (!CommSucc) 626 break; 627 628 OutliningInfo->Entries.push_back(CurrEntry); 629 CurrEntry = OtherSucc; 630 } while (true); 631 632 if (!CandidateFound) 633 return std::unique_ptr<FunctionOutliningInfo>(); 634 635 // Do sanity check of the entries: threre should not 636 // be any successors (not in the entry set) other than 637 // {ReturnBlock, NonReturnBlock} 638 assert(OutliningInfo->Entries[0] == &F->front() && 639 "Function Entry must be the first in Entries vector"); 640 DenseSet<BasicBlock *> Entries; 641 for (BasicBlock *E : OutliningInfo->Entries) 642 Entries.insert(E); 643 644 // Returns true of BB has Predecessor which is not 645 // in Entries set. 646 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 647 for (auto Pred : predecessors(BB)) { 648 if (!Entries.count(Pred)) 649 return true; 650 } 651 return false; 652 }; 653 auto CheckAndNormalizeCandidate = 654 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 655 for (BasicBlock *E : OutliningInfo->Entries) { 656 for (auto Succ : successors(E)) { 657 if (Entries.count(Succ)) 658 continue; 659 if (Succ == OutliningInfo->ReturnBlock) 660 OutliningInfo->ReturnBlockPreds.push_back(E); 661 else if (Succ != OutliningInfo->NonReturnBlock) 662 return false; 663 } 664 // There should not be any outside incoming edges either: 665 if (HasNonEntryPred(E)) 666 return false; 667 } 668 return true; 669 }; 670 671 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 672 return std::unique_ptr<FunctionOutliningInfo>(); 673 674 // Now further growing the candidate's inlining region by 675 // peeling off dominating blocks from the outlining region: 676 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { 677 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 678 if (succ_size(Cand) != 2) 679 break; 680 681 if (HasNonEntryPred(Cand)) 682 break; 683 684 BasicBlock *Succ1 = *succ_begin(Cand); 685 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 686 687 BasicBlock *ReturnBlock, *NonReturnBlock; 688 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 689 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 690 break; 691 692 if (NonReturnBlock->getSinglePredecessor() != Cand) 693 break; 694 695 // Now grow and update OutlininigInfo: 696 OutliningInfo->Entries.push_back(Cand); 697 OutliningInfo->NonReturnBlock = NonReturnBlock; 698 OutliningInfo->ReturnBlockPreds.push_back(Cand); 699 Entries.insert(Cand); 700 } 701 702 return OutliningInfo; 703 } 704 705 // Check if there is PGO data or user annotated branch data: 706 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { 707 if (F->hasProfileData()) 708 return true; 709 // Now check if any of the entry block has MD_prof data: 710 for (auto *E : OI->Entries) { 711 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 712 if (!BR || BR->isUnconditional()) 713 continue; 714 uint64_t T, F; 715 if (BR->extractProfMetadata(T, F)) 716 return true; 717 } 718 return false; 719 } 720 721 BranchProbability 722 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { 723 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 724 auto EntryFreq = 725 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 726 auto OutliningCallFreq = 727 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 728 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 729 // we outlined any regions, so we may encounter situations where the 730 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 731 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) { 732 OutliningCallFreq = EntryFreq; 733 } 734 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 735 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 736 737 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) 738 return OutlineRegionRelFreq; 739 740 // When profile data is not available, we need to be conservative in 741 // estimating the overall savings. Static branch prediction can usually 742 // guess the branch direction right (taken/non-taken), but the guessed 743 // branch probability is usually not biased enough. In case when the 744 // outlined region is predicted to be likely, its probability needs 745 // to be made higher (more biased) to not under-estimate the cost of 746 // function outlining. On the other hand, if the outlined region 747 // is predicted to be less likely, the predicted probablity is usually 748 // higher than the actual. For instance, the actual probability of the 749 // less likely target is only 5%, but the guessed probablity can be 750 // 40%. In the latter case, there is no need for further adjustement. 751 // FIXME: add an option for this. 752 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 753 return OutlineRegionRelFreq; 754 755 OutlineRegionRelFreq = std::max( 756 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 757 758 return OutlineRegionRelFreq; 759 } 760 761 bool PartialInlinerImpl::shouldPartialInline( 762 CallSite CS, FunctionCloner &Cloner, 763 BlockFrequency WeightedOutliningRcost, 764 OptimizationRemarkEmitter &ORE) { 765 using namespace ore; 766 767 Instruction *Call = CS.getInstruction(); 768 Function *Callee = CS.getCalledFunction(); 769 assert(Callee == Cloner.ClonedFunc); 770 771 if (SkipCostAnalysis) 772 return isInlineViable(*Callee); 773 774 Function *Caller = CS.getCaller(); 775 auto &CalleeTTI = (*GetTTI)(*Callee); 776 bool RemarksEnabled = 777 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled( 778 DEBUG_TYPE); 779 assert(Call && "invalid callsite for partial inline"); 780 InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(), 781 CalleeTTI, *GetAssumptionCache, GetBFI, PSI, 782 RemarksEnabled ? &ORE : nullptr); 783 784 if (IC.isAlways()) { 785 ORE.emit([&]() { 786 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) 787 << NV("Callee", Cloner.OrigFunc) 788 << " should always be fully inlined, not partially"; 789 }); 790 return false; 791 } 792 793 if (IC.isNever()) { 794 ORE.emit([&]() { 795 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) 796 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 797 << NV("Caller", Caller) 798 << " because it should never be inlined (cost=never)"; 799 }); 800 return false; 801 } 802 803 if (!IC) { 804 ORE.emit([&]() { 805 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) 806 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 807 << NV("Caller", Caller) << " because too costly to inline (cost=" 808 << NV("Cost", IC.getCost()) << ", threshold=" 809 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 810 }); 811 return false; 812 } 813 const DataLayout &DL = Caller->getParent()->getDataLayout(); 814 815 // The savings of eliminating the call: 816 int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL); 817 BlockFrequency NormWeightedSavings(NonWeightedSavings); 818 819 // Weighted saving is smaller than weighted cost, return false 820 if (NormWeightedSavings < WeightedOutliningRcost) { 821 ORE.emit([&]() { 822 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 823 Call) 824 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 825 << NV("Caller", Caller) << " runtime overhead (overhead=" 826 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 827 << ", savings=" 828 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 829 << ")" 830 << " of making the outlined call is too high"; 831 }); 832 833 return false; 834 } 835 836 ORE.emit([&]() { 837 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) 838 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 839 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 840 << " (threshold=" 841 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 842 }); 843 return true; 844 } 845 846 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 847 // For now use a simplified version. The returned 'InlineCost' will be used 848 // to esimate the size cost as well as runtime cost of the BB. 849 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { 850 int InlineCost = 0; 851 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 852 for (Instruction &I : BB->instructionsWithoutDebug()) { 853 // Skip free instructions. 854 switch (I.getOpcode()) { 855 case Instruction::BitCast: 856 case Instruction::PtrToInt: 857 case Instruction::IntToPtr: 858 case Instruction::Alloca: 859 case Instruction::PHI: 860 continue; 861 case Instruction::GetElementPtr: 862 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices()) 863 continue; 864 break; 865 default: 866 break; 867 } 868 869 if (I.isLifetimeStartOrEnd()) 870 continue; 871 872 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 873 InlineCost += getCallsiteCost(*CI, DL); 874 continue; 875 } 876 877 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { 878 InlineCost += getCallsiteCost(*II, DL); 879 continue; 880 } 881 882 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) { 883 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 884 continue; 885 } 886 InlineCost += InlineConstants::InstrCost; 887 } 888 return InlineCost; 889 } 890 891 std::tuple<int, int> 892 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { 893 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 894 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 895 Function *OutlinedFunc = FuncBBPair.first; 896 BasicBlock* OutliningCallBB = FuncBBPair.second; 897 // Now compute the cost of the call sequence to the outlined function 898 // 'OutlinedFunction' in BB 'OutliningCallBB': 899 OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); 900 901 // Now compute the cost of the extracted/outlined function itself: 902 for (BasicBlock &BB : *OutlinedFunc) 903 OutlinedFunctionCost += computeBBInlineCost(&BB); 904 } 905 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 906 "Outlined function cost should be no less than the outlined region"); 907 908 // The code extractor introduces a new root and exit stub blocks with 909 // additional unconditional branches. Those branches will be eliminated 910 // later with bb layout. The cost should be adjusted accordingly: 911 OutlinedFunctionCost -= 912 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 913 914 int OutliningRuntimeOverhead = 915 OutliningFuncCallCost + 916 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 917 ExtraOutliningPenalty; 918 919 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 920 } 921 922 // Create the callsite to profile count map which is 923 // used to update the original function's entry count, 924 // after the function is partially inlined into the callsite. 925 void PartialInlinerImpl::computeCallsiteToProfCountMap( 926 Function *DuplicateFunction, 927 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) { 928 std::vector<User *> Users(DuplicateFunction->user_begin(), 929 DuplicateFunction->user_end()); 930 Function *CurrentCaller = nullptr; 931 std::unique_ptr<BlockFrequencyInfo> TempBFI; 932 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 933 934 auto ComputeCurrBFI = [&,this](Function *Caller) { 935 // For the old pass manager: 936 if (!GetBFI) { 937 DominatorTree DT(*Caller); 938 LoopInfo LI(DT); 939 BranchProbabilityInfo BPI(*Caller, LI); 940 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 941 CurrentCallerBFI = TempBFI.get(); 942 } else { 943 // New pass manager: 944 CurrentCallerBFI = &(*GetBFI)(*Caller); 945 } 946 }; 947 948 for (User *User : Users) { 949 CallSite CS = getCallSite(User); 950 Function *Caller = CS.getCaller(); 951 if (CurrentCaller != Caller) { 952 CurrentCaller = Caller; 953 ComputeCurrBFI(Caller); 954 } else { 955 assert(CurrentCallerBFI && "CallerBFI is not set"); 956 } 957 BasicBlock *CallBB = CS.getInstruction()->getParent(); 958 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 959 if (Count) 960 CallSiteToProfCountMap[User] = *Count; 961 else 962 CallSiteToProfCountMap[User] = 0; 963 } 964 } 965 966 PartialInlinerImpl::FunctionCloner::FunctionCloner( 967 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, 968 function_ref<AssumptionCache *(Function &)> LookupAC) 969 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { 970 ClonedOI = std::make_unique<FunctionOutliningInfo>(); 971 972 // Clone the function, so that we can hack away on it. 973 ValueToValueMapTy VMap; 974 ClonedFunc = CloneFunction(F, VMap); 975 976 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 977 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 978 for (BasicBlock *BB : OI->Entries) { 979 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 980 } 981 for (BasicBlock *E : OI->ReturnBlockPreds) { 982 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 983 ClonedOI->ReturnBlockPreds.push_back(NewE); 984 } 985 // Go ahead and update all uses to the duplicate, so that we can just 986 // use the inliner functionality when we're done hacking. 987 F->replaceAllUsesWith(ClonedFunc); 988 } 989 990 PartialInlinerImpl::FunctionCloner::FunctionCloner( 991 Function *F, FunctionOutliningMultiRegionInfo *OI, 992 OptimizationRemarkEmitter &ORE, 993 function_ref<AssumptionCache *(Function &)> LookupAC) 994 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { 995 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); 996 997 // Clone the function, so that we can hack away on it. 998 ValueToValueMapTy VMap; 999 ClonedFunc = CloneFunction(F, VMap); 1000 1001 // Go through all Outline Candidate Regions and update all BasicBlock 1002 // information. 1003 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1004 OI->ORI) { 1005 SmallVector<BasicBlock *, 8> Region; 1006 for (BasicBlock *BB : RegionInfo.Region) { 1007 Region.push_back(cast<BasicBlock>(VMap[BB])); 1008 } 1009 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1010 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1011 BasicBlock *NewReturnBlock = nullptr; 1012 if (RegionInfo.ReturnBlock) 1013 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1014 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1015 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1016 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1017 } 1018 // Go ahead and update all uses to the duplicate, so that we can just 1019 // use the inliner functionality when we're done hacking. 1020 F->replaceAllUsesWith(ClonedFunc); 1021 } 1022 1023 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { 1024 auto getFirstPHI = [](BasicBlock *BB) { 1025 BasicBlock::iterator I = BB->begin(); 1026 PHINode *FirstPhi = nullptr; 1027 while (I != BB->end()) { 1028 PHINode *Phi = dyn_cast<PHINode>(I); 1029 if (!Phi) 1030 break; 1031 if (!FirstPhi) { 1032 FirstPhi = Phi; 1033 break; 1034 } 1035 } 1036 return FirstPhi; 1037 }; 1038 1039 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1040 // blocks. 1041 if (!ClonedOI) 1042 return; 1043 1044 // Special hackery is needed with PHI nodes that have inputs from more than 1045 // one extracted block. For simplicity, just split the PHIs into a two-level 1046 // sequence of PHIs, some of which will go in the extracted region, and some 1047 // of which will go outside. 1048 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1049 // only split block when necessary: 1050 PHINode *FirstPhi = getFirstPHI(PreReturn); 1051 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1052 1053 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1054 return; 1055 1056 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1057 Value *CommonValue = PN->getIncomingValue(0); 1058 if (all_of(PN->incoming_values(), 1059 [&](Value *V) { return V == CommonValue; })) 1060 return CommonValue; 1061 return nullptr; 1062 }; 1063 1064 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1065 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1066 BasicBlock::iterator I = PreReturn->begin(); 1067 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1068 SmallVector<Instruction *, 4> DeadPhis; 1069 while (I != PreReturn->end()) { 1070 PHINode *OldPhi = dyn_cast<PHINode>(I); 1071 if (!OldPhi) 1072 break; 1073 1074 PHINode *RetPhi = 1075 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1076 OldPhi->replaceAllUsesWith(RetPhi); 1077 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1078 1079 RetPhi->addIncoming(&*I, PreReturn); 1080 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1081 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1082 OldPhi->removeIncomingValue(E); 1083 } 1084 1085 // After incoming values splitting, the old phi may become trivial. 1086 // Keeping the trivial phi can introduce definition inside the outline 1087 // region which is live-out, causing necessary overhead (load, store 1088 // arg passing etc). 1089 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1090 OldPhi->replaceAllUsesWith(OldPhiVal); 1091 DeadPhis.push_back(OldPhi); 1092 } 1093 ++I; 1094 } 1095 for (auto *DP : DeadPhis) 1096 DP->eraseFromParent(); 1097 1098 for (auto E : ClonedOI->ReturnBlockPreds) { 1099 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1100 } 1101 } 1102 1103 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1104 1105 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) { 1106 int Cost = 0; 1107 for (BasicBlock* BB : Region) 1108 Cost += computeBBInlineCost(BB); 1109 return Cost; 1110 }; 1111 1112 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1113 1114 if (ClonedOMRI->ORI.empty()) 1115 return false; 1116 1117 // The CodeExtractor needs a dominator tree. 1118 DominatorTree DT; 1119 DT.recalculate(*ClonedFunc); 1120 1121 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1122 LoopInfo LI(DT); 1123 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1124 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1125 1126 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 1127 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1128 1129 SetVector<Value *> Inputs, Outputs, Sinks; 1130 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1131 ClonedOMRI->ORI) { 1132 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); 1133 1134 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1135 ClonedFuncBFI.get(), &BPI, 1136 LookupAC(*RegionInfo.EntryBlock->getParent()), 1137 /* AllowVarargs */ false); 1138 1139 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1140 1141 #ifndef NDEBUG 1142 if (TracePartialInlining) { 1143 dbgs() << "inputs: " << Inputs.size() << "\n"; 1144 dbgs() << "outputs: " << Outputs.size() << "\n"; 1145 for (Value *value : Inputs) 1146 dbgs() << "value used in func: " << *value << "\n"; 1147 for (Value *output : Outputs) 1148 dbgs() << "instr used in func: " << *output << "\n"; 1149 } 1150 #endif 1151 // Do not extract regions that have live exit variables. 1152 if (Outputs.size() > 0 && !ForceLiveExit) 1153 continue; 1154 1155 Function *OutlinedFunc = CE.extractCodeRegion(CEAC); 1156 1157 if (OutlinedFunc) { 1158 CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); 1159 BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent(); 1160 assert(OutliningCallBB->getParent() == ClonedFunc); 1161 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1162 NumColdRegionsOutlined++; 1163 OutlinedRegionCost += CurrentOutlinedRegionCost; 1164 1165 if (MarkOutlinedColdCC) { 1166 OutlinedFunc->setCallingConv(CallingConv::Cold); 1167 OCS.setCallingConv(CallingConv::Cold); 1168 } 1169 } else 1170 ORE.emit([&]() { 1171 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1172 &RegionInfo.Region.front()->front()) 1173 << "Failed to extract region at block " 1174 << ore::NV("Block", RegionInfo.Region.front()); 1175 }); 1176 } 1177 1178 return !OutlinedFunctions.empty(); 1179 } 1180 1181 Function * 1182 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1183 // Returns true if the block is to be partial inlined into the caller 1184 // (i.e. not to be extracted to the out of line function) 1185 auto ToBeInlined = [&, this](BasicBlock *BB) { 1186 return BB == ClonedOI->ReturnBlock || 1187 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != 1188 ClonedOI->Entries.end()); 1189 }; 1190 1191 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1192 // The CodeExtractor needs a dominator tree. 1193 DominatorTree DT; 1194 DT.recalculate(*ClonedFunc); 1195 1196 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1197 LoopInfo LI(DT); 1198 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1199 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1200 1201 // Gather up the blocks that we're going to extract. 1202 std::vector<BasicBlock *> ToExtract; 1203 ToExtract.push_back(ClonedOI->NonReturnBlock); 1204 OutlinedRegionCost += 1205 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); 1206 for (BasicBlock &BB : *ClonedFunc) 1207 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1208 ToExtract.push_back(&BB); 1209 // FIXME: the code extractor may hoist/sink more code 1210 // into the outlined function which may make the outlining 1211 // overhead (the difference of the outlined function cost 1212 // and OutliningRegionCost) look larger. 1213 OutlinedRegionCost += computeBBInlineCost(&BB); 1214 } 1215 1216 // Extract the body of the if. 1217 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1218 Function *OutlinedFunc = 1219 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1220 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), 1221 /* AllowVarargs */ true) 1222 .extractCodeRegion(CEAC); 1223 1224 if (OutlinedFunc) { 1225 BasicBlock *OutliningCallBB = 1226 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) 1227 .getInstruction() 1228 ->getParent(); 1229 assert(OutliningCallBB->getParent() == ClonedFunc); 1230 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1231 } else 1232 ORE.emit([&]() { 1233 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1234 &ToExtract.front()->front()) 1235 << "Failed to extract region at block " 1236 << ore::NV("Block", ToExtract.front()); 1237 }); 1238 1239 return OutlinedFunc; 1240 } 1241 1242 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1243 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1244 // users (function pointers, etc.) back to the original function. 1245 ClonedFunc->replaceAllUsesWith(OrigFunc); 1246 ClonedFunc->eraseFromParent(); 1247 if (!IsFunctionInlined) { 1248 // Remove each function that was speculatively created if there is no 1249 // reference. 1250 for (auto FuncBBPair : OutlinedFunctions) { 1251 Function *Func = FuncBBPair.first; 1252 Func->eraseFromParent(); 1253 } 1254 } 1255 } 1256 1257 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) { 1258 1259 if (F->hasAddressTaken()) 1260 return {false, nullptr}; 1261 1262 // Let inliner handle it 1263 if (F->hasFnAttribute(Attribute::AlwaysInline)) 1264 return {false, nullptr}; 1265 1266 if (F->hasFnAttribute(Attribute::NoInline)) 1267 return {false, nullptr}; 1268 1269 if (PSI->isFunctionEntryCold(F)) 1270 return {false, nullptr}; 1271 1272 if (F->users().empty()) 1273 return {false, nullptr}; 1274 1275 OptimizationRemarkEmitter ORE(F); 1276 1277 // Only try to outline cold regions if we have a profile summary, which 1278 // implies we have profiling information. 1279 if (PSI->hasProfileSummary() && F->hasProfileData() && 1280 !DisableMultiRegionPartialInline) { 1281 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1282 computeOutliningColdRegionsInfo(F, ORE); 1283 if (OMRI) { 1284 FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache); 1285 1286 #ifndef NDEBUG 1287 if (TracePartialInlining) { 1288 dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n"; 1289 dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold() 1290 << "\n"; 1291 } 1292 #endif 1293 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1294 1295 if (DidOutline) { 1296 #ifndef NDEBUG 1297 if (TracePartialInlining) { 1298 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1299 Cloner.ClonedFunc->print(dbgs()); 1300 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1301 } 1302 #endif 1303 1304 if (tryPartialInline(Cloner)) 1305 return {true, nullptr}; 1306 } 1307 } 1308 } 1309 1310 // Fall-thru to regular partial inlining if we: 1311 // i) can't find any cold regions to outline, or 1312 // ii) can't inline the outlined function anywhere. 1313 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1314 if (!OI) 1315 return {false, nullptr}; 1316 1317 FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache); 1318 Cloner.NormalizeReturnBlock(); 1319 1320 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1321 1322 if (!OutlinedFunction) 1323 return {false, nullptr}; 1324 1325 bool AnyInline = tryPartialInline(Cloner); 1326 1327 if (AnyInline) 1328 return {true, OutlinedFunction}; 1329 1330 return {false, nullptr}; 1331 } 1332 1333 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1334 if (Cloner.OutlinedFunctions.empty()) 1335 return false; 1336 1337 int SizeCost = 0; 1338 BlockFrequency WeightedRcost; 1339 int NonWeightedRcost; 1340 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); 1341 1342 // Only calculate RelativeToEntryFreq when we are doing single region 1343 // outlining. 1344 BranchProbability RelativeToEntryFreq; 1345 if (Cloner.ClonedOI) { 1346 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1347 } else 1348 // RelativeToEntryFreq doesn't make sense when we have more than one 1349 // outlined call because each call will have a different relative frequency 1350 // to the entry block. We can consider using the average, but the 1351 // usefulness of that information is questionable. For now, assume we never 1352 // execute the calls to outlined functions. 1353 RelativeToEntryFreq = BranchProbability(0, 1); 1354 1355 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1356 1357 // The call sequence(s) to the outlined function(s) are larger than the sum of 1358 // the original outlined region size(s), it does not increase the chances of 1359 // inlining the function with outlining (The inliner uses the size increase to 1360 // model the cost of inlining a callee). 1361 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1362 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1363 DebugLoc DLoc; 1364 BasicBlock *Block; 1365 std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc); 1366 OrigFuncORE.emit([&]() { 1367 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1368 DLoc, Block) 1369 << ore::NV("Function", Cloner.OrigFunc) 1370 << " not partially inlined into callers (Original Size = " 1371 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1372 << ", Size of call sequence to outlined function = " 1373 << ore::NV("NewSize", SizeCost) << ")"; 1374 }); 1375 return false; 1376 } 1377 1378 assert(Cloner.OrigFunc->users().empty() && 1379 "F's users should all be replaced!"); 1380 1381 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1382 Cloner.ClonedFunc->user_end()); 1383 1384 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1385 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1386 if (CalleeEntryCount) 1387 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1388 1389 uint64_t CalleeEntryCountV = 1390 (CalleeEntryCount ? CalleeEntryCount.getCount() : 0); 1391 1392 bool AnyInline = false; 1393 for (User *User : Users) { 1394 CallSite CS = getCallSite(User); 1395 1396 if (IsLimitReached()) 1397 continue; 1398 1399 OptimizationRemarkEmitter CallerORE(CS.getCaller()); 1400 if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE)) 1401 continue; 1402 1403 // Construct remark before doing the inlining, as after successful inlining 1404 // the callsite is removed. 1405 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()); 1406 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1407 << ore::NV("Caller", CS.getCaller()); 1408 1409 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); 1410 // We can only forward varargs when we outlined a single region, else we 1411 // bail on vararg functions. 1412 if (!InlineFunction(CS, IFI, nullptr, true, 1413 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1414 : nullptr))) 1415 continue; 1416 1417 CallerORE.emit(OR); 1418 1419 // Now update the entry count: 1420 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1421 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1422 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1423 } 1424 1425 AnyInline = true; 1426 NumPartialInlining++; 1427 // Update the stats 1428 if (Cloner.ClonedOI) 1429 NumPartialInlined++; 1430 else 1431 NumColdOutlinePartialInlined++; 1432 1433 } 1434 1435 if (AnyInline) { 1436 Cloner.IsFunctionInlined = true; 1437 if (CalleeEntryCount) 1438 Cloner.OrigFunc->setEntryCount( 1439 CalleeEntryCount.setCount(CalleeEntryCountV)); 1440 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1441 OrigFuncORE.emit([&]() { 1442 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1443 << "Partially inlined into at least one caller"; 1444 }); 1445 1446 } 1447 1448 return AnyInline; 1449 } 1450 1451 bool PartialInlinerImpl::run(Module &M) { 1452 if (DisablePartialInlining) 1453 return false; 1454 1455 std::vector<Function *> Worklist; 1456 Worklist.reserve(M.size()); 1457 for (Function &F : M) 1458 if (!F.use_empty() && !F.isDeclaration()) 1459 Worklist.push_back(&F); 1460 1461 bool Changed = false; 1462 while (!Worklist.empty()) { 1463 Function *CurrFunc = Worklist.back(); 1464 Worklist.pop_back(); 1465 1466 if (CurrFunc->use_empty()) 1467 continue; 1468 1469 bool Recursive = false; 1470 for (User *U : CurrFunc->users()) 1471 if (Instruction *I = dyn_cast<Instruction>(U)) 1472 if (I->getParent()->getParent() == CurrFunc) { 1473 Recursive = true; 1474 break; 1475 } 1476 if (Recursive) 1477 continue; 1478 1479 std::pair<bool, Function * > Result = unswitchFunction(CurrFunc); 1480 if (Result.second) 1481 Worklist.push_back(Result.second); 1482 Changed |= Result.first; 1483 } 1484 1485 return Changed; 1486 } 1487 1488 char PartialInlinerLegacyPass::ID = 0; 1489 1490 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 1491 "Partial Inliner", false, false) 1492 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1493 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1494 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1495 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 1496 "Partial Inliner", false, false) 1497 1498 ModulePass *llvm::createPartialInliningPass() { 1499 return new PartialInlinerLegacyPass(); 1500 } 1501 1502 PreservedAnalyses PartialInlinerPass::run(Module &M, 1503 ModuleAnalysisManager &AM) { 1504 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1505 1506 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 1507 [&FAM](Function &F) -> AssumptionCache & { 1508 return FAM.getResult<AssumptionAnalysis>(F); 1509 }; 1510 1511 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { 1512 return FAM.getCachedResult<AssumptionAnalysis>(F); 1513 }; 1514 1515 std::function<BlockFrequencyInfo &(Function &)> GetBFI = 1516 [&FAM](Function &F) -> BlockFrequencyInfo & { 1517 return FAM.getResult<BlockFrequencyAnalysis>(F); 1518 }; 1519 1520 std::function<TargetTransformInfo &(Function &)> GetTTI = 1521 [&FAM](Function &F) -> TargetTransformInfo & { 1522 return FAM.getResult<TargetIRAnalysis>(F); 1523 }; 1524 1525 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); 1526 1527 if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI, 1528 {GetBFI}, PSI) 1529 .run(M)) 1530 return PreservedAnalyses::none(); 1531 return PreservedAnalyses::all(); 1532 } 1533