1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/Optional.h" 18 #include "llvm/ADT/STLExtras.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/BranchProbabilityInfo.h" 23 #include "llvm/Analysis/InlineCost.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/ProfileSummaryInfo.h" 27 #include "llvm/Analysis/TargetLibraryInfo.h" 28 #include "llvm/Analysis/TargetTransformInfo.h" 29 #include "llvm/IR/Attributes.h" 30 #include "llvm/IR/BasicBlock.h" 31 #include "llvm/IR/CFG.h" 32 #include "llvm/IR/DebugLoc.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Dominators.h" 35 #include "llvm/IR/Function.h" 36 #include "llvm/IR/InstrTypes.h" 37 #include "llvm/IR/Instruction.h" 38 #include "llvm/IR/Instructions.h" 39 #include "llvm/IR/IntrinsicInst.h" 40 #include "llvm/IR/Intrinsics.h" 41 #include "llvm/IR/Module.h" 42 #include "llvm/IR/Operator.h" 43 #include "llvm/IR/User.h" 44 #include "llvm/InitializePasses.h" 45 #include "llvm/Pass.h" 46 #include "llvm/Support/BlockFrequency.h" 47 #include "llvm/Support/BranchProbability.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CommandLine.h" 50 #include "llvm/Support/ErrorHandling.h" 51 #include "llvm/Transforms/IPO.h" 52 #include "llvm/Transforms/Utils/Cloning.h" 53 #include "llvm/Transforms/Utils/CodeExtractor.h" 54 #include "llvm/Transforms/Utils/ValueMapper.h" 55 #include <algorithm> 56 #include <cassert> 57 #include <cstdint> 58 #include <memory> 59 #include <tuple> 60 #include <vector> 61 62 using namespace llvm; 63 64 #define DEBUG_TYPE "partial-inlining" 65 66 STATISTIC(NumPartialInlined, 67 "Number of callsites functions partially inlined into."); 68 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " 69 "cold outlined regions were partially " 70 "inlined into its caller(s)."); 71 STATISTIC(NumColdRegionsFound, 72 "Number of cold single entry/exit regions found."); 73 STATISTIC(NumColdRegionsOutlined, 74 "Number of cold single entry/exit regions outlined."); 75 76 // Command line option to disable partial-inlining. The default is false: 77 static cl::opt<bool> 78 DisablePartialInlining("disable-partial-inlining", cl::init(false), 79 cl::Hidden, cl::desc("Disable partial inlining")); 80 // Command line option to disable multi-region partial-inlining. The default is 81 // false: 82 static cl::opt<bool> DisableMultiRegionPartialInline( 83 "disable-mr-partial-inlining", cl::init(false), cl::Hidden, 84 cl::desc("Disable multi-region partial inlining")); 85 86 // Command line option to force outlining in regions with live exit variables. 87 // The default is false: 88 static cl::opt<bool> 89 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, 90 cl::desc("Force outline regions with live exits")); 91 92 // Command line option to enable marking outline functions with Cold Calling 93 // Convention. The default is false: 94 static cl::opt<bool> 95 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, 96 cl::desc("Mark outline function calls with ColdCC")); 97 98 // This is an option used by testing: 99 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 100 101 cl::ReallyHidden, 102 cl::desc("Skip Cost Analysis")); 103 // Used to determine if a cold region is worth outlining based on 104 // its inlining cost compared to the original function. Default is set at 10%. 105 // ie. if the cold region reduces the inlining cost of the original function by 106 // at least 10%. 107 static cl::opt<float> MinRegionSizeRatio( 108 "min-region-size-ratio", cl::init(0.1), cl::Hidden, 109 cl::desc("Minimum ratio comparing relative sizes of each " 110 "outline candidate and original function")); 111 // Used to tune the minimum number of execution counts needed in the predecessor 112 // block to the cold edge. ie. confidence interval. 113 static cl::opt<unsigned> 114 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, 115 cl::desc("Minimum block executions to consider " 116 "its BranchProbabilityInfo valid")); 117 // Used to determine when an edge is considered cold. Default is set to 10%. ie. 118 // if the branch probability is 10% or less, then it is deemed as 'cold'. 119 static cl::opt<float> ColdBranchRatio( 120 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 121 cl::desc("Minimum BranchProbability to consider a region cold.")); 122 123 static cl::opt<unsigned> MaxNumInlineBlocks( 124 "max-num-inline-blocks", cl::init(5), cl::Hidden, 125 cl::desc("Max number of blocks to be partially inlined")); 126 127 // Command line option to set the maximum number of partial inlining allowed 128 // for the module. The default value of -1 means no limit. 129 static cl::opt<int> MaxNumPartialInlining( 130 "max-partial-inlining", cl::init(-1), cl::Hidden, 131 cl::desc("Max number of partial inlining. The default is unlimited")); 132 133 // Used only when PGO or user annotated branch data is absent. It is 134 // the least value that is used to weigh the outline region. If BFI 135 // produces larger value, the BFI value will be used. 136 static cl::opt<int> 137 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 138 cl::Hidden, 139 cl::desc("Relative frequency of outline region to " 140 "the entry block")); 141 142 static cl::opt<unsigned> ExtraOutliningPenalty( 143 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 144 cl::desc("A debug option to add additional penalty to the computed one.")); 145 146 namespace { 147 148 struct FunctionOutliningInfo { 149 FunctionOutliningInfo() = default; 150 151 // Returns the number of blocks to be inlined including all blocks 152 // in Entries and one return block. 153 unsigned getNumInlinedBlocks() const { return Entries.size() + 1; } 154 155 // A set of blocks including the function entry that guard 156 // the region to be outlined. 157 SmallVector<BasicBlock *, 4> Entries; 158 159 // The return block that is not included in the outlined region. 160 BasicBlock *ReturnBlock = nullptr; 161 162 // The dominating block of the region to be outlined. 163 BasicBlock *NonReturnBlock = nullptr; 164 165 // The set of blocks in Entries that that are predecessors to ReturnBlock 166 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 167 }; 168 169 struct FunctionOutliningMultiRegionInfo { 170 FunctionOutliningMultiRegionInfo() = default; 171 172 // Container for outline regions 173 struct OutlineRegionInfo { 174 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 175 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 176 BasicBlock *ReturnBlock) 177 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 178 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 179 SmallVector<BasicBlock *, 8> Region; 180 BasicBlock *EntryBlock; 181 BasicBlock *ExitBlock; 182 BasicBlock *ReturnBlock; 183 }; 184 185 SmallVector<OutlineRegionInfo, 4> ORI; 186 }; 187 188 struct PartialInlinerImpl { 189 190 PartialInlinerImpl( 191 function_ref<AssumptionCache &(Function &)> GetAC, 192 function_ref<AssumptionCache *(Function &)> LookupAC, 193 function_ref<TargetTransformInfo &(Function &)> GTTI, 194 function_ref<const TargetLibraryInfo &(Function &)> GTLI, 195 ProfileSummaryInfo &ProfSI, 196 function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr) 197 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 198 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {} 199 200 bool run(Module &M); 201 // Main part of the transformation that calls helper functions to find 202 // outlining candidates, clone & outline the function, and attempt to 203 // partially inline the resulting function. Returns true if 204 // inlining was successful, false otherwise. Also returns the outline 205 // function (only if we partially inlined early returns) as there is a 206 // possibility to further "peel" early return statements that were left in the 207 // outline function due to code size. 208 std::pair<bool, Function *> unswitchFunction(Function &F); 209 210 // This class speculatively clones the function to be partial inlined. 211 // At the end of partial inlining, the remaining callsites to the cloned 212 // function that are not partially inlined will be fixed up to reference 213 // the original function, and the cloned function will be erased. 214 struct FunctionCloner { 215 // Two constructors, one for single region outlining, the other for 216 // multi-region outlining. 217 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 218 OptimizationRemarkEmitter &ORE, 219 function_ref<AssumptionCache *(Function &)> LookupAC, 220 function_ref<TargetTransformInfo &(Function &)> GetTTI); 221 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 222 OptimizationRemarkEmitter &ORE, 223 function_ref<AssumptionCache *(Function &)> LookupAC, 224 function_ref<TargetTransformInfo &(Function &)> GetTTI); 225 226 ~FunctionCloner(); 227 228 // Prepare for function outlining: making sure there is only 229 // one incoming edge from the extracted/outlined region to 230 // the return block. 231 void normalizeReturnBlock() const; 232 233 // Do function outlining for cold regions. 234 bool doMultiRegionFunctionOutlining(); 235 // Do function outlining for region after early return block(s). 236 // NOTE: For vararg functions that do the vararg handling in the outlined 237 // function, we temporarily generate IR that does not properly 238 // forward varargs to the outlined function. Calling InlineFunction 239 // will update calls to the outlined functions to properly forward 240 // the varargs. 241 Function *doSingleRegionFunctionOutlining(); 242 243 Function *OrigFunc = nullptr; 244 Function *ClonedFunc = nullptr; 245 246 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 247 // Keep track of Outlined Functions and the basic block they're called from. 248 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 249 250 // ClonedFunc is inlined in one of its callers after function 251 // outlining. 252 bool IsFunctionInlined = false; 253 // The cost of the region to be outlined. 254 InstructionCost OutlinedRegionCost = 0; 255 // ClonedOI is specific to outlining non-early return blocks. 256 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 257 // ClonedOMRI is specific to outlining cold regions. 258 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 259 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 260 OptimizationRemarkEmitter &ORE; 261 function_ref<AssumptionCache *(Function &)> LookupAC; 262 function_ref<TargetTransformInfo &(Function &)> GetTTI; 263 }; 264 265 private: 266 int NumPartialInlining = 0; 267 function_ref<AssumptionCache &(Function &)> GetAssumptionCache; 268 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 269 function_ref<TargetTransformInfo &(Function &)> GetTTI; 270 function_ref<BlockFrequencyInfo &(Function &)> GetBFI; 271 function_ref<const TargetLibraryInfo &(Function &)> GetTLI; 272 ProfileSummaryInfo &PSI; 273 274 // Return the frequency of the OutlininingBB relative to F's entry point. 275 // The result is no larger than 1 and is represented using BP. 276 // (Note that the outlined region's 'head' block can only have incoming 277 // edges from the guarding entry blocks). 278 BranchProbability 279 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const; 280 281 // Return true if the callee of CB should be partially inlined with 282 // profit. 283 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner, 284 BlockFrequency WeightedOutliningRcost, 285 OptimizationRemarkEmitter &ORE) const; 286 287 // Try to inline DuplicateFunction (cloned from F with call to 288 // the OutlinedFunction into its callers. Return true 289 // if there is any successful inlining. 290 bool tryPartialInline(FunctionCloner &Cloner); 291 292 // Compute the mapping from use site of DuplicationFunction to the enclosing 293 // BB's profile count. 294 void 295 computeCallsiteToProfCountMap(Function *DuplicateFunction, 296 DenseMap<User *, uint64_t> &SiteCountMap) const; 297 298 bool isLimitReached() const { 299 return (MaxNumPartialInlining != -1 && 300 NumPartialInlining >= MaxNumPartialInlining); 301 } 302 303 static CallBase *getSupportedCallBase(User *U) { 304 if (isa<CallInst>(U) || isa<InvokeInst>(U)) 305 return cast<CallBase>(U); 306 llvm_unreachable("All uses must be calls"); 307 return nullptr; 308 } 309 310 static CallBase *getOneCallSiteTo(Function &F) { 311 User *User = *F.user_begin(); 312 return getSupportedCallBase(User); 313 } 314 315 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const { 316 CallBase *CB = getOneCallSiteTo(F); 317 DebugLoc DLoc = CB->getDebugLoc(); 318 BasicBlock *Block = CB->getParent(); 319 return std::make_tuple(DLoc, Block); 320 } 321 322 // Returns the costs associated with function outlining: 323 // - The first value is the non-weighted runtime cost for making the call 324 // to the outlined function, including the addtional setup cost in the 325 // outlined function itself; 326 // - The second value is the estimated size of the new call sequence in 327 // basic block Cloner.OutliningCallBB; 328 std::tuple<InstructionCost, InstructionCost> 329 computeOutliningCosts(FunctionCloner &Cloner) const; 330 331 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 332 // approximate both the size and runtime cost (Note that in the current 333 // inline cost analysis, there is no clear distinction there either). 334 static InstructionCost computeBBInlineCost(BasicBlock *BB, 335 TargetTransformInfo *TTI); 336 337 std::unique_ptr<FunctionOutliningInfo> 338 computeOutliningInfo(Function &F) const; 339 340 std::unique_ptr<FunctionOutliningMultiRegionInfo> 341 computeOutliningColdRegionsInfo(Function &F, 342 OptimizationRemarkEmitter &ORE) const; 343 }; 344 345 struct PartialInlinerLegacyPass : public ModulePass { 346 static char ID; // Pass identification, replacement for typeid 347 348 PartialInlinerLegacyPass() : ModulePass(ID) { 349 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 350 } 351 352 void getAnalysisUsage(AnalysisUsage &AU) const override { 353 AU.addRequired<AssumptionCacheTracker>(); 354 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 355 AU.addRequired<TargetTransformInfoWrapperPass>(); 356 AU.addRequired<TargetLibraryInfoWrapperPass>(); 357 } 358 359 bool runOnModule(Module &M) override { 360 if (skipModule(M)) 361 return false; 362 363 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 364 TargetTransformInfoWrapperPass *TTIWP = 365 &getAnalysis<TargetTransformInfoWrapperPass>(); 366 ProfileSummaryInfo &PSI = 367 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 368 369 auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { 370 return ACT->getAssumptionCache(F); 371 }; 372 373 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { 374 return ACT->lookupAssumptionCache(F); 375 }; 376 377 auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & { 378 return TTIWP->getTTI(F); 379 }; 380 381 auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { 382 return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 383 }; 384 385 return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 386 GetTLI, PSI) 387 .run(M); 388 } 389 }; 390 391 } // end anonymous namespace 392 393 std::unique_ptr<FunctionOutliningMultiRegionInfo> 394 PartialInlinerImpl::computeOutliningColdRegionsInfo( 395 Function &F, OptimizationRemarkEmitter &ORE) const { 396 BasicBlock *EntryBlock = &F.front(); 397 398 DominatorTree DT(F); 399 LoopInfo LI(DT); 400 BranchProbabilityInfo BPI(F, LI); 401 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 402 BlockFrequencyInfo *BFI; 403 if (!GetBFI) { 404 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); 405 BFI = ScopedBFI.get(); 406 } else 407 BFI = &(GetBFI(F)); 408 409 // Return if we don't have profiling information. 410 if (!PSI.hasInstrumentationProfile()) 411 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 412 413 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 414 std::make_unique<FunctionOutliningMultiRegionInfo>(); 415 416 auto IsSingleExit = 417 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 418 BasicBlock *ExitBlock = nullptr; 419 for (auto *Block : BlockList) { 420 for (BasicBlock *Succ : successors(Block)) { 421 if (!is_contained(BlockList, Succ)) { 422 if (ExitBlock) { 423 ORE.emit([&]() { 424 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 425 &Succ->front()) 426 << "Region dominated by " 427 << ore::NV("Block", BlockList.front()->getName()) 428 << " has more than one region exit edge."; 429 }); 430 return nullptr; 431 } 432 433 ExitBlock = Block; 434 } 435 } 436 } 437 return ExitBlock; 438 }; 439 440 auto BBProfileCount = [BFI](BasicBlock *BB) { 441 return BFI->getBlockProfileCount(BB).value_or(0); 442 }; 443 444 // Use the same computeBBInlineCost function to compute the cost savings of 445 // the outlining the candidate region. 446 TargetTransformInfo *FTTI = &GetTTI(F); 447 InstructionCost OverallFunctionCost = 0; 448 for (auto &BB : F) 449 OverallFunctionCost += computeBBInlineCost(&BB, FTTI); 450 451 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost 452 << "\n";); 453 454 InstructionCost MinOutlineRegionCost = OverallFunctionCost.map( 455 [&](auto Cost) { return Cost * MinRegionSizeRatio; }); 456 457 BranchProbability MinBranchProbability( 458 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 459 MinBlockCounterExecution); 460 bool ColdCandidateFound = false; 461 BasicBlock *CurrEntry = EntryBlock; 462 std::vector<BasicBlock *> DFS; 463 DenseMap<BasicBlock *, bool> VisitedMap; 464 DFS.push_back(CurrEntry); 465 VisitedMap[CurrEntry] = true; 466 467 // Use Depth First Search on the basic blocks to find CFG edges that are 468 // considered cold. 469 // Cold regions considered must also have its inline cost compared to the 470 // overall inline cost of the original function. The region is outlined only 471 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 472 // more. 473 while (!DFS.empty()) { 474 auto *ThisBB = DFS.back(); 475 DFS.pop_back(); 476 // Only consider regions with predecessor blocks that are considered 477 // not-cold (default: part of the top 99.99% of all block counters) 478 // AND greater than our minimum block execution count (default: 100). 479 if (PSI.isColdBlock(ThisBB, BFI) || 480 BBProfileCount(ThisBB) < MinBlockCounterExecution) 481 continue; 482 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) { 483 if (VisitedMap[*SI]) 484 continue; 485 VisitedMap[*SI] = true; 486 DFS.push_back(*SI); 487 // If branch isn't cold, we skip to the next one. 488 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI); 489 if (SuccProb > MinBranchProbability) 490 continue; 491 492 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->" 493 << SI->getName() 494 << "\nBranch Probability = " << SuccProb << "\n";); 495 496 SmallVector<BasicBlock *, 8> DominateVector; 497 DT.getDescendants(*SI, DominateVector); 498 assert(!DominateVector.empty() && 499 "SI should be reachable and have at least itself as descendant"); 500 501 // We can only outline single entry regions (for now). 502 if (!DominateVector.front()->hasNPredecessors(1)) { 503 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 504 << " doesn't have a single predecessor in the " 505 "dominator tree\n";); 506 continue; 507 } 508 509 BasicBlock *ExitBlock = nullptr; 510 // We can only outline single exit regions (for now). 511 if (!(ExitBlock = IsSingleExit(DominateVector))) { 512 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 513 << " doesn't have a unique successor\n";); 514 continue; 515 } 516 517 InstructionCost OutlineRegionCost = 0; 518 for (auto *BB : DominateVector) 519 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 520 521 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost 522 << "\n";); 523 524 if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) { 525 ORE.emit([&]() { 526 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 527 &SI->front()) 528 << ore::NV("Callee", &F) 529 << " inline cost-savings smaller than " 530 << ore::NV("Cost", MinOutlineRegionCost); 531 }); 532 533 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than " 534 << MinOutlineRegionCost << "\n";); 535 continue; 536 } 537 538 // For now, ignore blocks that belong to a SISE region that is a 539 // candidate for outlining. In the future, we may want to look 540 // at inner regions because the outer region may have live-exit 541 // variables. 542 for (auto *BB : DominateVector) 543 VisitedMap[BB] = true; 544 545 // ReturnBlock here means the block after the outline call 546 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 547 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 548 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 549 OutliningInfo->ORI.push_back(RegInfo); 550 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: " 551 << DominateVector.front()->getName() << "\n";); 552 ColdCandidateFound = true; 553 NumColdRegionsFound++; 554 } 555 } 556 557 if (ColdCandidateFound) 558 return OutliningInfo; 559 560 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 561 } 562 563 std::unique_ptr<FunctionOutliningInfo> 564 PartialInlinerImpl::computeOutliningInfo(Function &F) const { 565 BasicBlock *EntryBlock = &F.front(); 566 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 567 if (!BR || BR->isUnconditional()) 568 return std::unique_ptr<FunctionOutliningInfo>(); 569 570 // Returns true if Succ is BB's successor 571 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 572 return is_contained(successors(BB), Succ); 573 }; 574 575 auto IsReturnBlock = [](BasicBlock *BB) { 576 Instruction *TI = BB->getTerminator(); 577 return isa<ReturnInst>(TI); 578 }; 579 580 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 581 if (IsReturnBlock(Succ1)) 582 return std::make_tuple(Succ1, Succ2); 583 if (IsReturnBlock(Succ2)) 584 return std::make_tuple(Succ2, Succ1); 585 586 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 587 }; 588 589 // Detect a triangular shape: 590 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 591 if (IsSuccessor(Succ1, Succ2)) 592 return std::make_tuple(Succ1, Succ2); 593 if (IsSuccessor(Succ2, Succ1)) 594 return std::make_tuple(Succ2, Succ1); 595 596 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 597 }; 598 599 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 600 std::make_unique<FunctionOutliningInfo>(); 601 602 BasicBlock *CurrEntry = EntryBlock; 603 bool CandidateFound = false; 604 do { 605 // The number of blocks to be inlined has already reached 606 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 607 // disables partial inlining for the function. 608 if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks) 609 break; 610 611 if (succ_size(CurrEntry) != 2) 612 break; 613 614 BasicBlock *Succ1 = *succ_begin(CurrEntry); 615 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 616 617 BasicBlock *ReturnBlock, *NonReturnBlock; 618 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 619 620 if (ReturnBlock) { 621 OutliningInfo->Entries.push_back(CurrEntry); 622 OutliningInfo->ReturnBlock = ReturnBlock; 623 OutliningInfo->NonReturnBlock = NonReturnBlock; 624 CandidateFound = true; 625 break; 626 } 627 628 BasicBlock *CommSucc, *OtherSucc; 629 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 630 631 if (!CommSucc) 632 break; 633 634 OutliningInfo->Entries.push_back(CurrEntry); 635 CurrEntry = OtherSucc; 636 } while (true); 637 638 if (!CandidateFound) 639 return std::unique_ptr<FunctionOutliningInfo>(); 640 641 // There should not be any successors (not in the entry set) other than 642 // {ReturnBlock, NonReturnBlock} 643 assert(OutliningInfo->Entries[0] == &F.front() && 644 "Function Entry must be the first in Entries vector"); 645 DenseSet<BasicBlock *> Entries; 646 for (BasicBlock *E : OutliningInfo->Entries) 647 Entries.insert(E); 648 649 // Returns true of BB has Predecessor which is not 650 // in Entries set. 651 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 652 for (auto *Pred : predecessors(BB)) { 653 if (!Entries.count(Pred)) 654 return true; 655 } 656 return false; 657 }; 658 auto CheckAndNormalizeCandidate = 659 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 660 for (BasicBlock *E : OutliningInfo->Entries) { 661 for (auto *Succ : successors(E)) { 662 if (Entries.count(Succ)) 663 continue; 664 if (Succ == OutliningInfo->ReturnBlock) 665 OutliningInfo->ReturnBlockPreds.push_back(E); 666 else if (Succ != OutliningInfo->NonReturnBlock) 667 return false; 668 } 669 // There should not be any outside incoming edges either: 670 if (HasNonEntryPred(E)) 671 return false; 672 } 673 return true; 674 }; 675 676 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 677 return std::unique_ptr<FunctionOutliningInfo>(); 678 679 // Now further growing the candidate's inlining region by 680 // peeling off dominating blocks from the outlining region: 681 while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) { 682 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 683 if (succ_size(Cand) != 2) 684 break; 685 686 if (HasNonEntryPred(Cand)) 687 break; 688 689 BasicBlock *Succ1 = *succ_begin(Cand); 690 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 691 692 BasicBlock *ReturnBlock, *NonReturnBlock; 693 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 694 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 695 break; 696 697 if (NonReturnBlock->getSinglePredecessor() != Cand) 698 break; 699 700 // Now grow and update OutlininigInfo: 701 OutliningInfo->Entries.push_back(Cand); 702 OutliningInfo->NonReturnBlock = NonReturnBlock; 703 OutliningInfo->ReturnBlockPreds.push_back(Cand); 704 Entries.insert(Cand); 705 } 706 707 return OutliningInfo; 708 } 709 710 // Check if there is PGO data or user annotated branch data: 711 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) { 712 if (F.hasProfileData()) 713 return true; 714 // Now check if any of the entry block has MD_prof data: 715 for (auto *E : OI.Entries) { 716 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 717 if (!BR || BR->isUnconditional()) 718 continue; 719 uint64_t T, F; 720 if (BR->extractProfMetadata(T, F)) 721 return true; 722 } 723 return false; 724 } 725 726 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( 727 FunctionCloner &Cloner) const { 728 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 729 auto EntryFreq = 730 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 731 auto OutliningCallFreq = 732 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 733 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 734 // we outlined any regions, so we may encounter situations where the 735 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 736 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) 737 OutliningCallFreq = EntryFreq; 738 739 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 740 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 741 742 if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI)) 743 return OutlineRegionRelFreq; 744 745 // When profile data is not available, we need to be conservative in 746 // estimating the overall savings. Static branch prediction can usually 747 // guess the branch direction right (taken/non-taken), but the guessed 748 // branch probability is usually not biased enough. In case when the 749 // outlined region is predicted to be likely, its probability needs 750 // to be made higher (more biased) to not under-estimate the cost of 751 // function outlining. On the other hand, if the outlined region 752 // is predicted to be less likely, the predicted probablity is usually 753 // higher than the actual. For instance, the actual probability of the 754 // less likely target is only 5%, but the guessed probablity can be 755 // 40%. In the latter case, there is no need for further adjustement. 756 // FIXME: add an option for this. 757 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 758 return OutlineRegionRelFreq; 759 760 OutlineRegionRelFreq = std::max( 761 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 762 763 return OutlineRegionRelFreq; 764 } 765 766 bool PartialInlinerImpl::shouldPartialInline( 767 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, 768 OptimizationRemarkEmitter &ORE) const { 769 using namespace ore; 770 771 Function *Callee = CB.getCalledFunction(); 772 assert(Callee == Cloner.ClonedFunc); 773 774 if (SkipCostAnalysis) 775 return isInlineViable(*Callee).isSuccess(); 776 777 Function *Caller = CB.getCaller(); 778 auto &CalleeTTI = GetTTI(*Callee); 779 bool RemarksEnabled = 780 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled( 781 DEBUG_TYPE); 782 InlineCost IC = 783 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache, 784 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr); 785 786 if (IC.isAlways()) { 787 ORE.emit([&]() { 788 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB) 789 << NV("Callee", Cloner.OrigFunc) 790 << " should always be fully inlined, not partially"; 791 }); 792 return false; 793 } 794 795 if (IC.isNever()) { 796 ORE.emit([&]() { 797 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB) 798 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 799 << NV("Caller", Caller) 800 << " because it should never be inlined (cost=never)"; 801 }); 802 return false; 803 } 804 805 if (!IC) { 806 ORE.emit([&]() { 807 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB) 808 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 809 << NV("Caller", Caller) << " because too costly to inline (cost=" 810 << NV("Cost", IC.getCost()) << ", threshold=" 811 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 812 }); 813 return false; 814 } 815 const DataLayout &DL = Caller->getParent()->getDataLayout(); 816 817 // The savings of eliminating the call: 818 int NonWeightedSavings = getCallsiteCost(CB, DL); 819 BlockFrequency NormWeightedSavings(NonWeightedSavings); 820 821 // Weighted saving is smaller than weighted cost, return false 822 if (NormWeightedSavings < WeightedOutliningRcost) { 823 ORE.emit([&]() { 824 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 825 &CB) 826 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 827 << NV("Caller", Caller) << " runtime overhead (overhead=" 828 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 829 << ", savings=" 830 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 831 << ")" 832 << " of making the outlined call is too high"; 833 }); 834 835 return false; 836 } 837 838 ORE.emit([&]() { 839 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB) 840 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 841 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 842 << " (threshold=" 843 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 844 }); 845 return true; 846 } 847 848 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 849 // For now use a simplified version. The returned 'InlineCost' will be used 850 // to esimate the size cost as well as runtime cost of the BB. 851 InstructionCost 852 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, 853 TargetTransformInfo *TTI) { 854 InstructionCost InlineCost = 0; 855 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 856 for (Instruction &I : BB->instructionsWithoutDebug()) { 857 // Skip free instructions. 858 switch (I.getOpcode()) { 859 case Instruction::BitCast: 860 case Instruction::PtrToInt: 861 case Instruction::IntToPtr: 862 case Instruction::Alloca: 863 case Instruction::PHI: 864 continue; 865 case Instruction::GetElementPtr: 866 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices()) 867 continue; 868 break; 869 default: 870 break; 871 } 872 873 if (I.isLifetimeStartOrEnd()) 874 continue; 875 876 if (auto *II = dyn_cast<IntrinsicInst>(&I)) { 877 Intrinsic::ID IID = II->getIntrinsicID(); 878 SmallVector<Type *, 4> Tys; 879 FastMathFlags FMF; 880 for (Value *Val : II->args()) 881 Tys.push_back(Val->getType()); 882 883 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 884 FMF = FPMO->getFastMathFlags(); 885 886 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); 887 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); 888 continue; 889 } 890 891 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 892 InlineCost += getCallsiteCost(*CI, DL); 893 continue; 894 } 895 896 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { 897 InlineCost += getCallsiteCost(*II, DL); 898 continue; 899 } 900 901 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) { 902 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 903 continue; 904 } 905 InlineCost += InlineConstants::InstrCost; 906 } 907 908 return InlineCost; 909 } 910 911 std::tuple<InstructionCost, InstructionCost> 912 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { 913 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 914 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 915 Function *OutlinedFunc = FuncBBPair.first; 916 BasicBlock* OutliningCallBB = FuncBBPair.second; 917 // Now compute the cost of the call sequence to the outlined function 918 // 'OutlinedFunction' in BB 'OutliningCallBB': 919 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 920 OutliningFuncCallCost += 921 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 922 923 // Now compute the cost of the extracted/outlined function itself: 924 for (BasicBlock &BB : *OutlinedFunc) 925 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 926 } 927 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 928 "Outlined function cost should be no less than the outlined region"); 929 930 // The code extractor introduces a new root and exit stub blocks with 931 // additional unconditional branches. Those branches will be eliminated 932 // later with bb layout. The cost should be adjusted accordingly: 933 OutlinedFunctionCost -= 934 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 935 936 InstructionCost OutliningRuntimeOverhead = 937 OutliningFuncCallCost + 938 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 939 ExtraOutliningPenalty.getValue(); 940 941 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 942 } 943 944 // Create the callsite to profile count map which is 945 // used to update the original function's entry count, 946 // after the function is partially inlined into the callsite. 947 void PartialInlinerImpl::computeCallsiteToProfCountMap( 948 Function *DuplicateFunction, 949 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const { 950 std::vector<User *> Users(DuplicateFunction->user_begin(), 951 DuplicateFunction->user_end()); 952 Function *CurrentCaller = nullptr; 953 std::unique_ptr<BlockFrequencyInfo> TempBFI; 954 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 955 956 auto ComputeCurrBFI = [&,this](Function *Caller) { 957 // For the old pass manager: 958 if (!GetBFI) { 959 DominatorTree DT(*Caller); 960 LoopInfo LI(DT); 961 BranchProbabilityInfo BPI(*Caller, LI); 962 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 963 CurrentCallerBFI = TempBFI.get(); 964 } else { 965 // New pass manager: 966 CurrentCallerBFI = &(GetBFI(*Caller)); 967 } 968 }; 969 970 for (User *User : Users) { 971 // Don't bother with BlockAddress used by CallBr for asm goto. 972 if (isa<BlockAddress>(User)) 973 continue; 974 CallBase *CB = getSupportedCallBase(User); 975 Function *Caller = CB->getCaller(); 976 if (CurrentCaller != Caller) { 977 CurrentCaller = Caller; 978 ComputeCurrBFI(Caller); 979 } else { 980 assert(CurrentCallerBFI && "CallerBFI is not set"); 981 } 982 BasicBlock *CallBB = CB->getParent(); 983 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 984 if (Count) 985 CallSiteToProfCountMap[User] = *Count; 986 else 987 CallSiteToProfCountMap[User] = 0; 988 } 989 } 990 991 PartialInlinerImpl::FunctionCloner::FunctionCloner( 992 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, 993 function_ref<AssumptionCache *(Function &)> LookupAC, 994 function_ref<TargetTransformInfo &(Function &)> GetTTI) 995 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 996 ClonedOI = std::make_unique<FunctionOutliningInfo>(); 997 998 // Clone the function, so that we can hack away on it. 999 ValueToValueMapTy VMap; 1000 ClonedFunc = CloneFunction(F, VMap); 1001 1002 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 1003 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 1004 for (BasicBlock *BB : OI->Entries) 1005 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 1006 1007 for (BasicBlock *E : OI->ReturnBlockPreds) { 1008 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 1009 ClonedOI->ReturnBlockPreds.push_back(NewE); 1010 } 1011 // Go ahead and update all uses to the duplicate, so that we can just 1012 // use the inliner functionality when we're done hacking. 1013 F->replaceAllUsesWith(ClonedFunc); 1014 } 1015 1016 PartialInlinerImpl::FunctionCloner::FunctionCloner( 1017 Function *F, FunctionOutliningMultiRegionInfo *OI, 1018 OptimizationRemarkEmitter &ORE, 1019 function_ref<AssumptionCache *(Function &)> LookupAC, 1020 function_ref<TargetTransformInfo &(Function &)> GetTTI) 1021 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 1022 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); 1023 1024 // Clone the function, so that we can hack away on it. 1025 ValueToValueMapTy VMap; 1026 ClonedFunc = CloneFunction(F, VMap); 1027 1028 // Go through all Outline Candidate Regions and update all BasicBlock 1029 // information. 1030 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1031 OI->ORI) { 1032 SmallVector<BasicBlock *, 8> Region; 1033 for (BasicBlock *BB : RegionInfo.Region) 1034 Region.push_back(cast<BasicBlock>(VMap[BB])); 1035 1036 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1037 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1038 BasicBlock *NewReturnBlock = nullptr; 1039 if (RegionInfo.ReturnBlock) 1040 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1041 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1042 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1043 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1044 } 1045 // Go ahead and update all uses to the duplicate, so that we can just 1046 // use the inliner functionality when we're done hacking. 1047 F->replaceAllUsesWith(ClonedFunc); 1048 } 1049 1050 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { 1051 auto GetFirstPHI = [](BasicBlock *BB) { 1052 BasicBlock::iterator I = BB->begin(); 1053 PHINode *FirstPhi = nullptr; 1054 while (I != BB->end()) { 1055 PHINode *Phi = dyn_cast<PHINode>(I); 1056 if (!Phi) 1057 break; 1058 if (!FirstPhi) { 1059 FirstPhi = Phi; 1060 break; 1061 } 1062 } 1063 return FirstPhi; 1064 }; 1065 1066 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1067 // blocks. 1068 if (!ClonedOI) 1069 return; 1070 1071 // Special hackery is needed with PHI nodes that have inputs from more than 1072 // one extracted block. For simplicity, just split the PHIs into a two-level 1073 // sequence of PHIs, some of which will go in the extracted region, and some 1074 // of which will go outside. 1075 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1076 // only split block when necessary: 1077 PHINode *FirstPhi = GetFirstPHI(PreReturn); 1078 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1079 1080 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1081 return; 1082 1083 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1084 Value *CommonValue = PN->getIncomingValue(0); 1085 if (all_of(PN->incoming_values(), 1086 [&](Value *V) { return V == CommonValue; })) 1087 return CommonValue; 1088 return nullptr; 1089 }; 1090 1091 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1092 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1093 BasicBlock::iterator I = PreReturn->begin(); 1094 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1095 SmallVector<Instruction *, 4> DeadPhis; 1096 while (I != PreReturn->end()) { 1097 PHINode *OldPhi = dyn_cast<PHINode>(I); 1098 if (!OldPhi) 1099 break; 1100 1101 PHINode *RetPhi = 1102 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1103 OldPhi->replaceAllUsesWith(RetPhi); 1104 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1105 1106 RetPhi->addIncoming(&*I, PreReturn); 1107 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1108 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1109 OldPhi->removeIncomingValue(E); 1110 } 1111 1112 // After incoming values splitting, the old phi may become trivial. 1113 // Keeping the trivial phi can introduce definition inside the outline 1114 // region which is live-out, causing necessary overhead (load, store 1115 // arg passing etc). 1116 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1117 OldPhi->replaceAllUsesWith(OldPhiVal); 1118 DeadPhis.push_back(OldPhi); 1119 } 1120 ++I; 1121 } 1122 for (auto *DP : DeadPhis) 1123 DP->eraseFromParent(); 1124 1125 for (auto *E : ClonedOI->ReturnBlockPreds) 1126 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1127 } 1128 1129 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1130 1131 auto ComputeRegionCost = 1132 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost { 1133 InstructionCost Cost = 0; 1134 for (BasicBlock* BB : Region) 1135 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 1136 return Cost; 1137 }; 1138 1139 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1140 1141 if (ClonedOMRI->ORI.empty()) 1142 return false; 1143 1144 // The CodeExtractor needs a dominator tree. 1145 DominatorTree DT; 1146 DT.recalculate(*ClonedFunc); 1147 1148 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1149 LoopInfo LI(DT); 1150 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1151 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1152 1153 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 1154 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1155 1156 SetVector<Value *> Inputs, Outputs, Sinks; 1157 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1158 ClonedOMRI->ORI) { 1159 InstructionCost CurrentOutlinedRegionCost = 1160 ComputeRegionCost(RegionInfo.Region); 1161 1162 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1163 ClonedFuncBFI.get(), &BPI, 1164 LookupAC(*RegionInfo.EntryBlock->getParent()), 1165 /* AllowVarargs */ false); 1166 1167 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1168 1169 LLVM_DEBUG({ 1170 dbgs() << "inputs: " << Inputs.size() << "\n"; 1171 dbgs() << "outputs: " << Outputs.size() << "\n"; 1172 for (Value *value : Inputs) 1173 dbgs() << "value used in func: " << *value << "\n"; 1174 for (Value *output : Outputs) 1175 dbgs() << "instr used in func: " << *output << "\n"; 1176 }); 1177 1178 // Do not extract regions that have live exit variables. 1179 if (Outputs.size() > 0 && !ForceLiveExit) 1180 continue; 1181 1182 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) { 1183 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc); 1184 BasicBlock *OutliningCallBB = OCS->getParent(); 1185 assert(OutliningCallBB->getParent() == ClonedFunc); 1186 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1187 NumColdRegionsOutlined++; 1188 OutlinedRegionCost += CurrentOutlinedRegionCost; 1189 1190 if (MarkOutlinedColdCC) { 1191 OutlinedFunc->setCallingConv(CallingConv::Cold); 1192 OCS->setCallingConv(CallingConv::Cold); 1193 } 1194 } else 1195 ORE.emit([&]() { 1196 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1197 &RegionInfo.Region.front()->front()) 1198 << "Failed to extract region at block " 1199 << ore::NV("Block", RegionInfo.Region.front()); 1200 }); 1201 } 1202 1203 return !OutlinedFunctions.empty(); 1204 } 1205 1206 Function * 1207 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1208 // Returns true if the block is to be partial inlined into the caller 1209 // (i.e. not to be extracted to the out of line function) 1210 auto ToBeInlined = [&, this](BasicBlock *BB) { 1211 return BB == ClonedOI->ReturnBlock || 1212 llvm::is_contained(ClonedOI->Entries, BB); 1213 }; 1214 1215 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1216 // The CodeExtractor needs a dominator tree. 1217 DominatorTree DT; 1218 DT.recalculate(*ClonedFunc); 1219 1220 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1221 LoopInfo LI(DT); 1222 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1223 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1224 1225 // Gather up the blocks that we're going to extract. 1226 std::vector<BasicBlock *> ToExtract; 1227 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); 1228 ToExtract.push_back(ClonedOI->NonReturnBlock); 1229 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( 1230 ClonedOI->NonReturnBlock, ClonedFuncTTI); 1231 for (BasicBlock &BB : *ClonedFunc) 1232 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1233 ToExtract.push_back(&BB); 1234 // FIXME: the code extractor may hoist/sink more code 1235 // into the outlined function which may make the outlining 1236 // overhead (the difference of the outlined function cost 1237 // and OutliningRegionCost) look larger. 1238 OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); 1239 } 1240 1241 // Extract the body of the if. 1242 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1243 Function *OutlinedFunc = 1244 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1245 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), 1246 /* AllowVarargs */ true) 1247 .extractCodeRegion(CEAC); 1248 1249 if (OutlinedFunc) { 1250 BasicBlock *OutliningCallBB = 1251 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent(); 1252 assert(OutliningCallBB->getParent() == ClonedFunc); 1253 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1254 } else 1255 ORE.emit([&]() { 1256 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1257 &ToExtract.front()->front()) 1258 << "Failed to extract region at block " 1259 << ore::NV("Block", ToExtract.front()); 1260 }); 1261 1262 return OutlinedFunc; 1263 } 1264 1265 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1266 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1267 // users (function pointers, etc.) back to the original function. 1268 ClonedFunc->replaceAllUsesWith(OrigFunc); 1269 ClonedFunc->eraseFromParent(); 1270 if (!IsFunctionInlined) { 1271 // Remove each function that was speculatively created if there is no 1272 // reference. 1273 for (auto FuncBBPair : OutlinedFunctions) { 1274 Function *Func = FuncBBPair.first; 1275 Func->eraseFromParent(); 1276 } 1277 } 1278 } 1279 1280 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) { 1281 if (F.hasAddressTaken()) 1282 return {false, nullptr}; 1283 1284 // Let inliner handle it 1285 if (F.hasFnAttribute(Attribute::AlwaysInline)) 1286 return {false, nullptr}; 1287 1288 if (F.hasFnAttribute(Attribute::NoInline)) 1289 return {false, nullptr}; 1290 1291 if (PSI.isFunctionEntryCold(&F)) 1292 return {false, nullptr}; 1293 1294 if (F.users().empty()) 1295 return {false, nullptr}; 1296 1297 OptimizationRemarkEmitter ORE(&F); 1298 1299 // Only try to outline cold regions if we have a profile summary, which 1300 // implies we have profiling information. 1301 if (PSI.hasProfileSummary() && F.hasProfileData() && 1302 !DisableMultiRegionPartialInline) { 1303 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1304 computeOutliningColdRegionsInfo(F, ORE); 1305 if (OMRI) { 1306 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); 1307 1308 LLVM_DEBUG({ 1309 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n"; 1310 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold() 1311 << "\n"; 1312 }); 1313 1314 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1315 1316 if (DidOutline) { 1317 LLVM_DEBUG({ 1318 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1319 Cloner.ClonedFunc->print(dbgs()); 1320 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1321 }); 1322 1323 if (tryPartialInline(Cloner)) 1324 return {true, nullptr}; 1325 } 1326 } 1327 } 1328 1329 // Fall-thru to regular partial inlining if we: 1330 // i) can't find any cold regions to outline, or 1331 // ii) can't inline the outlined function anywhere. 1332 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1333 if (!OI) 1334 return {false, nullptr}; 1335 1336 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI); 1337 Cloner.normalizeReturnBlock(); 1338 1339 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1340 1341 if (!OutlinedFunction) 1342 return {false, nullptr}; 1343 1344 if (tryPartialInline(Cloner)) 1345 return {true, OutlinedFunction}; 1346 1347 return {false, nullptr}; 1348 } 1349 1350 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1351 if (Cloner.OutlinedFunctions.empty()) 1352 return false; 1353 1354 int SizeCost = 0; 1355 BlockFrequency WeightedRcost; 1356 int NonWeightedRcost; 1357 1358 auto OutliningCosts = computeOutliningCosts(Cloner); 1359 assert(std::get<0>(OutliningCosts).isValid() && 1360 std::get<1>(OutliningCosts).isValid() && "Expected valid costs"); 1361 1362 SizeCost = *std::get<0>(OutliningCosts).getValue(); 1363 NonWeightedRcost = *std::get<1>(OutliningCosts).getValue(); 1364 1365 // Only calculate RelativeToEntryFreq when we are doing single region 1366 // outlining. 1367 BranchProbability RelativeToEntryFreq; 1368 if (Cloner.ClonedOI) 1369 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1370 else 1371 // RelativeToEntryFreq doesn't make sense when we have more than one 1372 // outlined call because each call will have a different relative frequency 1373 // to the entry block. We can consider using the average, but the 1374 // usefulness of that information is questionable. For now, assume we never 1375 // execute the calls to outlined functions. 1376 RelativeToEntryFreq = BranchProbability(0, 1); 1377 1378 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1379 1380 // The call sequence(s) to the outlined function(s) are larger than the sum of 1381 // the original outlined region size(s), it does not increase the chances of 1382 // inlining the function with outlining (The inliner uses the size increase to 1383 // model the cost of inlining a callee). 1384 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1385 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1386 DebugLoc DLoc; 1387 BasicBlock *Block; 1388 std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc); 1389 OrigFuncORE.emit([&]() { 1390 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1391 DLoc, Block) 1392 << ore::NV("Function", Cloner.OrigFunc) 1393 << " not partially inlined into callers (Original Size = " 1394 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1395 << ", Size of call sequence to outlined function = " 1396 << ore::NV("NewSize", SizeCost) << ")"; 1397 }); 1398 return false; 1399 } 1400 1401 assert(Cloner.OrigFunc->users().empty() && 1402 "F's users should all be replaced!"); 1403 1404 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1405 Cloner.ClonedFunc->user_end()); 1406 1407 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1408 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1409 if (CalleeEntryCount) 1410 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1411 1412 uint64_t CalleeEntryCountV = 1413 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); 1414 1415 bool AnyInline = false; 1416 for (User *User : Users) { 1417 // Don't bother with BlockAddress used by CallBr for asm goto. 1418 if (isa<BlockAddress>(User)) 1419 continue; 1420 1421 CallBase *CB = getSupportedCallBase(User); 1422 1423 if (isLimitReached()) 1424 continue; 1425 1426 OptimizationRemarkEmitter CallerORE(CB->getCaller()); 1427 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE)) 1428 continue; 1429 1430 // Construct remark before doing the inlining, as after successful inlining 1431 // the callsite is removed. 1432 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB); 1433 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1434 << ore::NV("Caller", CB->getCaller()); 1435 1436 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI); 1437 // We can only forward varargs when we outlined a single region, else we 1438 // bail on vararg functions. 1439 if (!InlineFunction(*CB, IFI, nullptr, true, 1440 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1441 : nullptr)) 1442 .isSuccess()) 1443 continue; 1444 1445 CallerORE.emit(OR); 1446 1447 // Now update the entry count: 1448 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1449 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1450 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1451 } 1452 1453 AnyInline = true; 1454 NumPartialInlining++; 1455 // Update the stats 1456 if (Cloner.ClonedOI) 1457 NumPartialInlined++; 1458 else 1459 NumColdOutlinePartialInlined++; 1460 } 1461 1462 if (AnyInline) { 1463 Cloner.IsFunctionInlined = true; 1464 if (CalleeEntryCount) 1465 Cloner.OrigFunc->setEntryCount(Function::ProfileCount( 1466 CalleeEntryCountV, CalleeEntryCount->getType())); 1467 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1468 OrigFuncORE.emit([&]() { 1469 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1470 << "Partially inlined into at least one caller"; 1471 }); 1472 } 1473 1474 return AnyInline; 1475 } 1476 1477 bool PartialInlinerImpl::run(Module &M) { 1478 if (DisablePartialInlining) 1479 return false; 1480 1481 std::vector<Function *> Worklist; 1482 Worklist.reserve(M.size()); 1483 for (Function &F : M) 1484 if (!F.use_empty() && !F.isDeclaration()) 1485 Worklist.push_back(&F); 1486 1487 bool Changed = false; 1488 while (!Worklist.empty()) { 1489 Function *CurrFunc = Worklist.back(); 1490 Worklist.pop_back(); 1491 1492 if (CurrFunc->use_empty()) 1493 continue; 1494 1495 bool Recursive = false; 1496 for (User *U : CurrFunc->users()) 1497 if (Instruction *I = dyn_cast<Instruction>(U)) 1498 if (I->getParent()->getParent() == CurrFunc) { 1499 Recursive = true; 1500 break; 1501 } 1502 if (Recursive) 1503 continue; 1504 1505 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc); 1506 if (Result.second) 1507 Worklist.push_back(Result.second); 1508 Changed |= Result.first; 1509 } 1510 1511 return Changed; 1512 } 1513 1514 char PartialInlinerLegacyPass::ID = 0; 1515 1516 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 1517 "Partial Inliner", false, false) 1518 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1519 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1520 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1521 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 1522 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 1523 "Partial Inliner", false, false) 1524 1525 ModulePass *llvm::createPartialInliningPass() { 1526 return new PartialInlinerLegacyPass(); 1527 } 1528 1529 PreservedAnalyses PartialInlinerPass::run(Module &M, 1530 ModuleAnalysisManager &AM) { 1531 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1532 1533 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { 1534 return FAM.getResult<AssumptionAnalysis>(F); 1535 }; 1536 1537 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { 1538 return FAM.getCachedResult<AssumptionAnalysis>(F); 1539 }; 1540 1541 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { 1542 return FAM.getResult<BlockFrequencyAnalysis>(F); 1543 }; 1544 1545 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { 1546 return FAM.getResult<TargetIRAnalysis>(F); 1547 }; 1548 1549 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { 1550 return FAM.getResult<TargetLibraryAnalysis>(F); 1551 }; 1552 1553 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M); 1554 1555 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 1556 GetTLI, PSI, GetBFI) 1557 .run(M)) 1558 return PreservedAnalyses::none(); 1559 return PreservedAnalyses::all(); 1560 } 1561