1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/BranchProbabilityInfo.h" 24 #include "llvm/Analysis/InlineCost.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 27 #include "llvm/Analysis/ProfileSummaryInfo.h" 28 #include "llvm/Analysis/TargetLibraryInfo.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/IR/Attributes.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/CFG.h" 33 #include "llvm/IR/CallSite.h" 34 #include "llvm/IR/DebugLoc.h" 35 #include "llvm/IR/DiagnosticInfo.h" 36 #include "llvm/IR/Dominators.h" 37 #include "llvm/IR/Function.h" 38 #include "llvm/IR/InstrTypes.h" 39 #include "llvm/IR/Instruction.h" 40 #include "llvm/IR/Instructions.h" 41 #include "llvm/IR/IntrinsicInst.h" 42 #include "llvm/IR/Intrinsics.h" 43 #include "llvm/IR/Module.h" 44 #include "llvm/IR/User.h" 45 #include "llvm/Pass.h" 46 #include "llvm/Support/BlockFrequency.h" 47 #include "llvm/Support/BranchProbability.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CommandLine.h" 50 
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// i.e. if the cold region reduces the inlining cost of the original function
// by at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));

// Used to tune the minimum number of execution counts needed in the
// predecessor block to the cold edge. i.e. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));

// Used to determine when an edge is considered cold. Default is set to 10%.
// i.e. if the branch probability is 10% or less, then it is deemed as 'cold'.
128 static cl::opt<float> ColdBranchRatio( 129 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 130 cl::desc("Minimum BranchProbability to consider a region cold.")); 131 132 static cl::opt<unsigned> MaxNumInlineBlocks( 133 "max-num-inline-blocks", cl::init(5), cl::Hidden, 134 cl::desc("Max number of blocks to be partially inlined")); 135 136 // Command line option to set the maximum number of partial inlining allowed 137 // for the module. The default value of -1 means no limit. 138 static cl::opt<int> MaxNumPartialInlining( 139 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 140 cl::desc("Max number of partial inlining. The default is unlimited")); 141 142 // Used only when PGO or user annotated branch data is absent. It is 143 // the least value that is used to weigh the outline region. If BFI 144 // produces larger value, the BFI value will be used. 145 static cl::opt<int> 146 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 147 cl::Hidden, cl::ZeroOrMore, 148 cl::desc("Relative frequency of outline region to " 149 "the entry block")); 150 151 static cl::opt<unsigned> ExtraOutliningPenalty( 152 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 153 cl::desc("A debug option to add additional penalty to the computed one.")); 154 155 namespace { 156 157 struct FunctionOutliningInfo { 158 FunctionOutliningInfo() = default; 159 160 // Returns the number of blocks to be inlined including all blocks 161 // in Entries and one return block. 162 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } 163 164 // A set of blocks including the function entry that guard 165 // the region to be outlined. 166 SmallVector<BasicBlock *, 4> Entries; 167 168 // The return block that is not included in the outlined region. 169 BasicBlock *ReturnBlock = nullptr; 170 171 // The dominating block of the region to be outlined. 
172 BasicBlock *NonReturnBlock = nullptr; 173 174 // The set of blocks in Entries that that are predecessors to ReturnBlock 175 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 176 }; 177 178 struct FunctionOutliningMultiRegionInfo { 179 FunctionOutliningMultiRegionInfo() 180 : ORI() {} 181 182 // Container for outline regions 183 struct OutlineRegionInfo { 184 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 185 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 186 BasicBlock *ReturnBlock) 187 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 188 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 189 SmallVector<BasicBlock *, 8> Region; 190 BasicBlock *EntryBlock; 191 BasicBlock *ExitBlock; 192 BasicBlock *ReturnBlock; 193 }; 194 195 SmallVector<OutlineRegionInfo, 4> ORI; 196 }; 197 198 struct PartialInlinerImpl { 199 200 PartialInlinerImpl( 201 std::function<AssumptionCache &(Function &)> *GetAC, 202 function_ref<AssumptionCache *(Function &)> LookupAC, 203 std::function<TargetTransformInfo &(Function &)> *GTTI, 204 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, 205 ProfileSummaryInfo *ProfSI) 206 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 207 GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} 208 209 bool run(Module &M); 210 // Main part of the transformation that calls helper functions to find 211 // outlining candidates, clone & outline the function, and attempt to 212 // partially inline the resulting function. Returns true if 213 // inlining was successful, false otherwise. Also returns the outline 214 // function (only if we partially inlined early returns) as there is a 215 // possibility to further "peel" early return statements that were left in the 216 // outline function due to code size. 217 std::pair<bool, Function *> unswitchFunction(Function *F); 218 219 // This class speculatively clones the function to be partial inlined. 
220 // At the end of partial inlining, the remaining callsites to the cloned 221 // function that are not partially inlined will be fixed up to reference 222 // the original function, and the cloned function will be erased. 223 struct FunctionCloner { 224 // Two constructors, one for single region outlining, the other for 225 // multi-region outlining. 226 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 227 OptimizationRemarkEmitter &ORE, 228 function_ref<AssumptionCache *(Function &)> LookupAC); 229 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 230 OptimizationRemarkEmitter &ORE, 231 function_ref<AssumptionCache *(Function &)> LookupAC); 232 ~FunctionCloner(); 233 234 // Prepare for function outlining: making sure there is only 235 // one incoming edge from the extracted/outlined region to 236 // the return block. 237 void NormalizeReturnBlock(); 238 239 // Do function outlining for cold regions. 240 bool doMultiRegionFunctionOutlining(); 241 // Do function outlining for region after early return block(s). 242 // NOTE: For vararg functions that do the vararg handling in the outlined 243 // function, we temporarily generate IR that does not properly 244 // forward varargs to the outlined function. Calling InlineFunction 245 // will update calls to the outlined functions to properly forward 246 // the varargs. 247 Function *doSingleRegionFunctionOutlining(); 248 249 Function *OrigFunc = nullptr; 250 Function *ClonedFunc = nullptr; 251 252 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 253 // Keep track of Outlined Functions and the basic block they're called from. 254 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 255 256 // ClonedFunc is inlined in one of its callers after function 257 // outlining. 258 bool IsFunctionInlined = false; 259 // The cost of the region to be outlined. 260 int OutlinedRegionCost = 0; 261 // ClonedOI is specific to outlining non-early return blocks. 
262 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 263 // ClonedOMRI is specific to outlining cold regions. 264 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 265 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 266 OptimizationRemarkEmitter &ORE; 267 function_ref<AssumptionCache *(Function &)> LookupAC; 268 }; 269 270 private: 271 int NumPartialInlining = 0; 272 std::function<AssumptionCache &(Function &)> *GetAssumptionCache; 273 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 274 std::function<TargetTransformInfo &(Function &)> *GetTTI; 275 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; 276 ProfileSummaryInfo *PSI; 277 278 // Return the frequency of the OutlininingBB relative to F's entry point. 279 // The result is no larger than 1 and is represented using BP. 280 // (Note that the outlined region's 'head' block can only have incoming 281 // edges from the guarding entry blocks). 282 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner); 283 284 // Return true if the callee of CS should be partially inlined with 285 // profit. 286 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, 287 BlockFrequency WeightedOutliningRcost, 288 OptimizationRemarkEmitter &ORE); 289 290 // Try to inline DuplicateFunction (cloned from F with call to 291 // the OutlinedFunction into its callers. Return true 292 // if there is any successful inlining. 293 bool tryPartialInline(FunctionCloner &Cloner); 294 295 // Compute the mapping from use site of DuplicationFunction to the enclosing 296 // BB's profile count. 
297 void computeCallsiteToProfCountMap(Function *DuplicateFunction, 298 DenseMap<User *, uint64_t> &SiteCountMap); 299 300 bool IsLimitReached() { 301 return (MaxNumPartialInlining != -1 && 302 NumPartialInlining >= MaxNumPartialInlining); 303 } 304 305 static CallSite getCallSite(User *U) { 306 CallSite CS; 307 if (CallInst *CI = dyn_cast<CallInst>(U)) 308 CS = CallSite(CI); 309 else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) 310 CS = CallSite(II); 311 else 312 llvm_unreachable("All uses must be calls"); 313 return CS; 314 } 315 316 static CallSite getOneCallSiteTo(Function *F) { 317 User *User = *F->user_begin(); 318 return getCallSite(User); 319 } 320 321 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { 322 CallSite CS = getOneCallSiteTo(F); 323 DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); 324 BasicBlock *Block = CS.getParent(); 325 return std::make_tuple(DLoc, Block); 326 } 327 328 // Returns the costs associated with function outlining: 329 // - The first value is the non-weighted runtime cost for making the call 330 // to the outlined function, including the addtional setup cost in the 331 // outlined function itself; 332 // - The second value is the estimated size of the new call sequence in 333 // basic block Cloner.OutliningCallBB; 334 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); 335 336 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 337 // approximate both the size and runtime cost (Note that in the current 338 // inline cost analysis, there is no clear distinction there either). 
339 static int computeBBInlineCost(BasicBlock *BB); 340 341 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); 342 std::unique_ptr<FunctionOutliningMultiRegionInfo> 343 computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE); 344 }; 345 346 struct PartialInlinerLegacyPass : public ModulePass { 347 static char ID; // Pass identification, replacement for typeid 348 349 PartialInlinerLegacyPass() : ModulePass(ID) { 350 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 351 } 352 353 void getAnalysisUsage(AnalysisUsage &AU) const override { 354 AU.addRequired<AssumptionCacheTracker>(); 355 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 356 AU.addRequired<TargetTransformInfoWrapperPass>(); 357 } 358 359 bool runOnModule(Module &M) override { 360 if (skipModule(M)) 361 return false; 362 363 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 364 TargetTransformInfoWrapperPass *TTIWP = 365 &getAnalysis<TargetTransformInfoWrapperPass>(); 366 ProfileSummaryInfo *PSI = 367 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 368 369 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 370 [&ACT](Function &F) -> AssumptionCache & { 371 return ACT->getAssumptionCache(F); 372 }; 373 374 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { 375 return ACT->lookupAssumptionCache(F); 376 }; 377 378 std::function<TargetTransformInfo &(Function &)> GetTTI = 379 [&TTIWP](Function &F) -> TargetTransformInfo & { 380 return TTIWP->getTTI(F); 381 }; 382 383 return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, 384 &GetTTI, NoneType::None, PSI) 385 .run(M); 386 } 387 }; 388 389 } // end anonymous namespace 390 391 std::unique_ptr<FunctionOutliningMultiRegionInfo> 392 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, 393 OptimizationRemarkEmitter &ORE) { 394 BasicBlock *EntryBlock = &F->front(); 395 396 DominatorTree DT(*F); 397 LoopInfo 
LI(DT); 398 BranchProbabilityInfo BPI(*F, LI); 399 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 400 BlockFrequencyInfo *BFI; 401 if (!GetBFI) { 402 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI)); 403 BFI = ScopedBFI.get(); 404 } else 405 BFI = &(*GetBFI)(*F); 406 407 // Return if we don't have profiling information. 408 if (!PSI->hasInstrumentationProfile()) 409 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 410 411 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 412 llvm::make_unique<FunctionOutliningMultiRegionInfo>(); 413 414 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { 415 BasicBlock *Dom = BlockList.front(); 416 return BlockList.size() > 1 && Dom->hasNPredecessors(1); 417 }; 418 419 auto IsSingleExit = 420 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 421 BasicBlock *ExitBlock = nullptr; 422 for (auto *Block : BlockList) { 423 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { 424 if (!is_contained(BlockList, *SI)) { 425 if (ExitBlock) { 426 ORE.emit([&]() { 427 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 428 &SI->front()) 429 << "Region dominated by " 430 << ore::NV("Block", BlockList.front()->getName()) 431 << " has more than one region exit edge."; 432 }); 433 return nullptr; 434 } else 435 ExitBlock = Block; 436 } 437 } 438 } 439 return ExitBlock; 440 }; 441 442 auto BBProfileCount = [BFI](BasicBlock *BB) { 443 return BFI->getBlockProfileCount(BB) 444 ? BFI->getBlockProfileCount(BB).getValue() 445 : 0; 446 }; 447 448 // Use the same computeBBInlineCost function to compute the cost savings of 449 // the outlining the candidate region. 
450 int OverallFunctionCost = 0; 451 for (auto &BB : *F) 452 OverallFunctionCost += computeBBInlineCost(&BB); 453 454 #ifndef NDEBUG 455 if (TracePartialInlining) 456 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n"; 457 #endif 458 int MinOutlineRegionCost = 459 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); 460 BranchProbability MinBranchProbability( 461 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 462 MinBlockCounterExecution); 463 bool ColdCandidateFound = false; 464 BasicBlock *CurrEntry = EntryBlock; 465 std::vector<BasicBlock *> DFS; 466 DenseMap<BasicBlock *, bool> VisitedMap; 467 DFS.push_back(CurrEntry); 468 VisitedMap[CurrEntry] = true; 469 // Use Depth First Search on the basic blocks to find CFG edges that are 470 // considered cold. 471 // Cold regions considered must also have its inline cost compared to the 472 // overall inline cost of the original function. The region is outlined only 473 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 474 // more. 475 while (!DFS.empty()) { 476 auto *thisBB = DFS.back(); 477 DFS.pop_back(); 478 // Only consider regions with predecessor blocks that are considered 479 // not-cold (default: part of the top 99.99% of all block counters) 480 // AND greater than our minimum block execution count (default: 100). 481 if (PSI->isColdBlock(thisBB, BFI) || 482 BBProfileCount(thisBB) < MinBlockCounterExecution) 483 continue; 484 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { 485 if (VisitedMap[*SI]) 486 continue; 487 VisitedMap[*SI] = true; 488 DFS.push_back(*SI); 489 // If branch isn't cold, we skip to the next one. 
490 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI); 491 if (SuccProb > MinBranchProbability) 492 continue; 493 #ifndef NDEBUG 494 if (TracePartialInlining) { 495 dbgs() << "Found cold edge: " << thisBB->getName() << "->" 496 << (*SI)->getName() << "\nBranch Probability = " << SuccProb 497 << "\n"; 498 } 499 #endif 500 SmallVector<BasicBlock *, 8> DominateVector; 501 DT.getDescendants(*SI, DominateVector); 502 // We can only outline single entry regions (for now). 503 if (!IsSingleEntry(DominateVector)) 504 continue; 505 BasicBlock *ExitBlock = nullptr; 506 // We can only outline single exit regions (for now). 507 if (!(ExitBlock = IsSingleExit(DominateVector))) 508 continue; 509 int OutlineRegionCost = 0; 510 for (auto *BB : DominateVector) 511 OutlineRegionCost += computeBBInlineCost(BB); 512 513 #ifndef NDEBUG 514 if (TracePartialInlining) 515 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n"; 516 #endif 517 518 if (OutlineRegionCost < MinOutlineRegionCost) { 519 ORE.emit([&]() { 520 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 521 &SI->front()) 522 << ore::NV("Callee", F) << " inline cost-savings smaller than " 523 << ore::NV("Cost", MinOutlineRegionCost); 524 }); 525 continue; 526 } 527 // For now, ignore blocks that belong to a SISE region that is a 528 // candidate for outlining. In the future, we may want to look 529 // at inner regions because the outer region may have live-exit 530 // variables. 
531 for (auto *BB : DominateVector) 532 VisitedMap[BB] = true; 533 // ReturnBlock here means the block after the outline call 534 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 535 // assert(ReturnBlock && "ReturnBlock is NULL somehow!"); 536 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 537 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 538 OutliningInfo->ORI.push_back(RegInfo); 539 #ifndef NDEBUG 540 if (TracePartialInlining) { 541 dbgs() << "Found Cold Candidate starting at block: " 542 << DominateVector.front()->getName() << "\n"; 543 } 544 #endif 545 ColdCandidateFound = true; 546 NumColdRegionsFound++; 547 } 548 } 549 if (ColdCandidateFound) 550 return OutliningInfo; 551 else 552 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 553 } 554 555 std::unique_ptr<FunctionOutliningInfo> 556 PartialInlinerImpl::computeOutliningInfo(Function *F) { 557 BasicBlock *EntryBlock = &F->front(); 558 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 559 if (!BR || BR->isUnconditional()) 560 return std::unique_ptr<FunctionOutliningInfo>(); 561 562 // Returns true if Succ is BB's successor 563 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 564 return is_contained(successors(BB), Succ); 565 }; 566 567 auto IsReturnBlock = [](BasicBlock *BB) { 568 Instruction *TI = BB->getTerminator(); 569 return isa<ReturnInst>(TI); 570 }; 571 572 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 573 if (IsReturnBlock(Succ1)) 574 return std::make_tuple(Succ1, Succ2); 575 if (IsReturnBlock(Succ2)) 576 return std::make_tuple(Succ2, Succ1); 577 578 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 579 }; 580 581 // Detect a triangular shape: 582 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 583 if (IsSuccessor(Succ1, Succ2)) 584 return std::make_tuple(Succ1, Succ2); 585 if (IsSuccessor(Succ2, Succ1)) 586 return std::make_tuple(Succ2, Succ1); 587 588 return 
std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 589 }; 590 591 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 592 llvm::make_unique<FunctionOutliningInfo>(); 593 594 BasicBlock *CurrEntry = EntryBlock; 595 bool CandidateFound = false; 596 do { 597 // The number of blocks to be inlined has already reached 598 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 599 // disables partial inlining for the function. 600 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) 601 break; 602 603 if (succ_size(CurrEntry) != 2) 604 break; 605 606 BasicBlock *Succ1 = *succ_begin(CurrEntry); 607 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 608 609 BasicBlock *ReturnBlock, *NonReturnBlock; 610 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 611 612 if (ReturnBlock) { 613 OutliningInfo->Entries.push_back(CurrEntry); 614 OutliningInfo->ReturnBlock = ReturnBlock; 615 OutliningInfo->NonReturnBlock = NonReturnBlock; 616 CandidateFound = true; 617 break; 618 } 619 620 BasicBlock *CommSucc; 621 BasicBlock *OtherSucc; 622 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 623 624 if (!CommSucc) 625 break; 626 627 OutliningInfo->Entries.push_back(CurrEntry); 628 CurrEntry = OtherSucc; 629 } while (true); 630 631 if (!CandidateFound) 632 return std::unique_ptr<FunctionOutliningInfo>(); 633 634 // Do sanity check of the entries: threre should not 635 // be any successors (not in the entry set) other than 636 // {ReturnBlock, NonReturnBlock} 637 assert(OutliningInfo->Entries[0] == &F->front() && 638 "Function Entry must be the first in Entries vector"); 639 DenseSet<BasicBlock *> Entries; 640 for (BasicBlock *E : OutliningInfo->Entries) 641 Entries.insert(E); 642 643 // Returns true of BB has Predecessor which is not 644 // in Entries set. 
645 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 646 for (auto Pred : predecessors(BB)) { 647 if (!Entries.count(Pred)) 648 return true; 649 } 650 return false; 651 }; 652 auto CheckAndNormalizeCandidate = 653 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 654 for (BasicBlock *E : OutliningInfo->Entries) { 655 for (auto Succ : successors(E)) { 656 if (Entries.count(Succ)) 657 continue; 658 if (Succ == OutliningInfo->ReturnBlock) 659 OutliningInfo->ReturnBlockPreds.push_back(E); 660 else if (Succ != OutliningInfo->NonReturnBlock) 661 return false; 662 } 663 // There should not be any outside incoming edges either: 664 if (HasNonEntryPred(E)) 665 return false; 666 } 667 return true; 668 }; 669 670 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 671 return std::unique_ptr<FunctionOutliningInfo>(); 672 673 // Now further growing the candidate's inlining region by 674 // peeling off dominating blocks from the outlining region: 675 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { 676 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 677 if (succ_size(Cand) != 2) 678 break; 679 680 if (HasNonEntryPred(Cand)) 681 break; 682 683 BasicBlock *Succ1 = *succ_begin(Cand); 684 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 685 686 BasicBlock *ReturnBlock, *NonReturnBlock; 687 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 688 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 689 break; 690 691 if (NonReturnBlock->getSinglePredecessor() != Cand) 692 break; 693 694 // Now grow and update OutlininigInfo: 695 OutliningInfo->Entries.push_back(Cand); 696 OutliningInfo->NonReturnBlock = NonReturnBlock; 697 OutliningInfo->ReturnBlockPreds.push_back(Cand); 698 Entries.insert(Cand); 699 } 700 701 return OutliningInfo; 702 } 703 704 // Check if there is PGO data or user annoated branch data: 705 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { 706 if (F->hasProfileData()) 707 return 
true; 708 // Now check if any of the entry block has MD_prof data: 709 for (auto *E : OI->Entries) { 710 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 711 if (!BR || BR->isUnconditional()) 712 continue; 713 uint64_t T, F; 714 if (BR->extractProfMetadata(T, F)) 715 return true; 716 } 717 return false; 718 } 719 720 BranchProbability 721 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { 722 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 723 auto EntryFreq = 724 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 725 auto OutliningCallFreq = 726 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 727 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 728 // we outlined any regions, so we may encounter situations where the 729 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 730 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) { 731 OutliningCallFreq = EntryFreq; 732 } 733 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 734 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 735 736 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) 737 return OutlineRegionRelFreq; 738 739 // When profile data is not available, we need to be conservative in 740 // estimating the overall savings. Static branch prediction can usually 741 // guess the branch direction right (taken/non-taken), but the guessed 742 // branch probability is usually not biased enough. In case when the 743 // outlined region is predicted to be likely, its probability needs 744 // to be made higher (more biased) to not under-estimate the cost of 745 // function outlining. On the other hand, if the outlined region 746 // is predicted to be less likely, the predicted probablity is usually 747 // higher than the actual. 
For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

// Decide whether partially inlining the cloned callee at call site \p CS is
// profitable: the inline cost of the (smaller) residual function must be
// acceptable, and the weighted savings of removing the original call must
// outweigh \p WeightedOutliningRcost, the weighted runtime cost of calling
// the outlined region. Emits optimization remarks explaining the decision.
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // With cost analysis disabled, only legality of inlining matters.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  assert(Call && "invalid callsite for partial inline");
  InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
                                CalleeTTI, *GetAssumptionCache, GetBFI, PSI,
                                RemarksEnabled ? &ORE : nullptr);

  // An always-inline callee should be fully inlined by the regular inliner;
  // partial inlining would only get in the way.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      // A GEP with all-zero indices is a no-op address computation.
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    // Lifetime markers generate no code.
    if (I.isLifetimeStartOrEnd())
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(*CI, DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(*II, DL);
      continue;
    }

    // A switch is charged one instruction per case plus the default.
    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

// Returns (size cost of the call sequence(s) to the outlined function(s),
// estimated runtime overhead of making those outlined calls).
std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock* OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  // Snapshot the users first: the map is consulted later while call sites are
  // being mutated by inlining.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&,this](Function *Caller) {
      // For the old pass manager:
      if (!GetBFI) {
        // No BFI getter available; build DT/LI/BPI/BFI from scratch for this
        // caller and keep it alive in TempBFI.
        DominatorTree DT(*Caller);
        LoopInfo LI(DT);
        BranchProbabilityInfo BPI(*Caller, LI);
        TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
        CurrentCallerBFI = TempBFI.get();
      } else {
        // New pass manager:
        CurrentCallerBFI = &(*GetBFI)(*Caller);
      }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    // Recompute BFI only when the caller changes between consecutive call
    // sites; otherwise reuse the one computed last iteration.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    // Call sites without profile data are recorded with a zero count.
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

// Single-region cloner: clones \p F and remaps the outlining info (return
// block, non-return block, entries, return-block predecessors) into the clone.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Translate the outlining-relevant blocks of the original function into
  // their counterparts in the clone via the value map.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Multi-region cloner: clones \p F and remaps each cold outline candidate
// region (blocks, entry, exit, optional return block) into the clone.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
  ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       OI->ORI) {
    // Remap every block of the candidate region into the cloned function.
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region) {
      Region.push_back(cast<BasicBlock>(VMap[BB]));
    }
    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    // ReturnBlock is optional for a region; only remap it when present.
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Returns the first instruction of BB if it is a PHI, else nullptr.
  // NOTE(review): the while loop always exits on its first iteration (either
  // branch breaks, and I is never advanced), so only BB's first instruction
  // is ever inspected. The loop form is misleading but harmless.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // No PHI, or the PHI only merges values from the entry predecessors plus
  // one other edge: nothing to normalize.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // A PHI is "trivial" when every incoming value is the same; return that
  // common value, or nullptr if the PHI is not trivial.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split off everything after the PHIs; the upper part (PreReturn) will be
  // extracted, the new lower block becomes the residual return block.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Second-level PHI in the new return block: merges the old PHI's result
    // (coming from the extracted region) with the values arriving directly
    // from the entry predecessors.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    // Move the entry-predecessor edges from the old PHI onto the new one.
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  // Erase trivial PHIs after the iteration so the iterator stays valid.
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Redirect the entry predecessors to branch to the new (split) return
  // block instead of the block that will be extracted.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  // Sum of the per-block inline costs of a candidate region.
  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
    int Cost = 0;
    for (BasicBlock* BB : Region)
      Cost += computeBBInlineCost(BB);
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // NOTE(review): these SetVectors are declared outside the loop and are
  // never cleared, so findInputsOutputs accumulates values across regions;
  // once one region has live outputs, every later region also appears to
  // have outputs and is skipped. Confirm whether that is intended.
  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI,
                     LookupAC(*RegionInfo.EntryBlock->getParent()),
                     /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

#ifndef NDEBUG
    if (TracePartialInlining) {
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    }
#endif
    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    Function *OutlinedFunc = CE.extractCodeRegion();

    if (OutlinedFunc) {
      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      // Optionally mark the outlined cold function with the cold calling
      // convention (both the function and its single call site).
      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS.setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
            ClonedOI->Entries.end());
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  // Extract everything that is neither an entry block nor the residual
  // return block: the non-return block plus all remaining blocks.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // Extract the body of the if.
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion();

  if (OutlinedFunc) {
    // The extractor leaves exactly one call to the outlined function; record
    // the block containing it for later cost computation.
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            .getInstruction()
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

// Attempt partial inlining of \p F. Returns {changed, new outlined function
// or nullptr}; a non-null second member is pushed back onto the worklist by
// the caller (PartialInlinerImpl::run).
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Address-taken functions cannot have all uses rewritten to the clone.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  // An entirely cold function is not worth partial inlining.
  if (PSI->isFunctionEntryCold(F))
    return {false, nullptr};

  if (empty(F->users()))
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI->hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  // Only report the outlined function upward when at least one call site was
  // actually partially inlined.
  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}

// Try to inline the cloned (reduced) function into each of its call sites.
// Returns true if at least one call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(empty(Cloner.OrigFunc->users()) &&
         "F's users should all be replaced!");

  // Snapshot the users: inlining below removes call sites while we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  // Only build the per-callsite count map when there is an entry count to
  // redistribute.
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    // Respect the global partial-inlining count limit.
    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CS.getCaller());
    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    // subtract this call site's count from the callee's remaining entry
    // count, saturating at zero.
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    // Write back the reduced entry count to the original function.
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}

// Driver: visit every defined, used function in the module and attempt
// partial inlining; newly outlined functions are added to the worklist so
// they can be considered in turn.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    // Skip directly recursive functions (a call to CurrFunc from within
    // CurrFunc itself).
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
    // A newly created outlined function is itself a candidate.
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

// New pass manager entry point: wires the function-level analysis getters
// (AssumptionCache, BFI, TTI) and module-level ProfileSummaryInfo into the
// shared PartialInlinerImpl.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  // Returns only a cached result (may be null); used where computing a fresh
  // AssumptionCache is not wanted.
  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
                         {GetBFI}, PSI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}