1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/BranchProbabilityInfo.h" 24 #include "llvm/Analysis/InlineCost.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 27 #include "llvm/Analysis/ProfileSummaryInfo.h" 28 #include "llvm/Analysis/TargetLibraryInfo.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/IR/Attributes.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/CFG.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Dominators.h" 36 #include "llvm/IR/Function.h" 37 #include "llvm/IR/InstrTypes.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/IntrinsicInst.h" 41 #include "llvm/IR/Intrinsics.h" 42 #include "llvm/IR/Module.h" 43 #include "llvm/IR/User.h" 44 #include "llvm/InitializePasses.h" 45 #include "llvm/Pass.h" 46 #include "llvm/Support/BlockFrequency.h" 47 #include "llvm/Support/BranchProbability.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CommandLine.h" 50 #include "llvm/Support/ErrorHandling.h" 51 #include "llvm/Transforms/IPO.h" 52 #include "llvm/Transforms/Utils/Cloning.h" 53 #include "llvm/Transforms/Utils/CodeExtractor.h" 54 #include "llvm/Transforms/Utils/ValueMapper.h" 55 #include <algorithm> 56 #include <cassert> 57 #include <cstdint> 58 #include <functional> 59 #include <iterator> 60 #include <memory> 61 #include <tuple> 62 #include <vector> 63 64 using namespace llvm; 65 66 #define DEBUG_TYPE "partial-inlining" 67 68 STATISTIC(NumPartialInlined, 69 "Number of callsites functions partially inlined into."); 70 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " 71 "cold outlined regions were partially " 72 "inlined into its caller(s)."); 73 STATISTIC(NumColdRegionsFound, 74 "Number of cold single entry/exit regions found."); 75 STATISTIC(NumColdRegionsOutlined, 76 "Number of cold single entry/exit regions outlined."); 77 78 // Command line option to disable partial-inlining. The default is false: 79 static cl::opt<bool> 80 DisablePartialInlining("disable-partial-inlining", cl::init(false), 81 cl::Hidden, cl::desc("Disable partial inlining")); 82 // Command line option to disable multi-region partial-inlining. The default is 83 // false: 84 static cl::opt<bool> DisableMultiRegionPartialInline( 85 "disable-mr-partial-inlining", cl::init(false), cl::Hidden, 86 cl::desc("Disable multi-region partial inlining")); 87 88 // Command line option to force outlining in regions with live exit variables. 89 // The default is false: 90 static cl::opt<bool> 91 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, 92 cl::desc("Force outline regions with live exits")); 93 94 // Command line option to enable marking outline functions with Cold Calling 95 // Convention. The default is false: 96 static cl::opt<bool> 97 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, 98 cl::desc("Mark outline function calls with ColdCC")); 99 100 // This is an option used by testing: 101 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 102 cl::init(false), cl::ZeroOrMore, 103 cl::ReallyHidden, 104 cl::desc("Skip Cost Analysis")); 105 // Used to determine if a cold region is worth outlining based on 106 // its inlining cost compared to the original function. Default is set at 10%. 107 // ie. if the cold region reduces the inlining cost of the original function by 108 // at least 10%. 109 static cl::opt<float> MinRegionSizeRatio( 110 "min-region-size-ratio", cl::init(0.1), cl::Hidden, 111 cl::desc("Minimum ratio comparing relative sizes of each " 112 "outline candidate and original function")); 113 // Used to tune the minimum number of execution counts needed in the predecessor 114 // block to the cold edge. ie. confidence interval. 115 static cl::opt<unsigned> 116 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, 117 cl::desc("Minimum block executions to consider " 118 "its BranchProbabilityInfo valid")); 119 // Used to determine when an edge is considered cold. Default is set to 10%. ie. 120 // if the branch probability is 10% or less, then it is deemed as 'cold'. 121 static cl::opt<float> ColdBranchRatio( 122 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 123 cl::desc("Minimum BranchProbability to consider a region cold.")); 124 125 static cl::opt<unsigned> MaxNumInlineBlocks( 126 "max-num-inline-blocks", cl::init(5), cl::Hidden, 127 cl::desc("Max number of blocks to be partially inlined")); 128 129 // Command line option to set the maximum number of partial inlining allowed 130 // for the module. The default value of -1 means no limit. 131 static cl::opt<int> MaxNumPartialInlining( 132 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 133 cl::desc("Max number of partial inlining. The default is unlimited")); 134 135 // Used only when PGO or user annotated branch data is absent. It is 136 // the least value that is used to weigh the outline region. If BFI 137 // produces larger value, the BFI value will be used. 138 static cl::opt<int> 139 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 140 cl::Hidden, cl::ZeroOrMore, 141 cl::desc("Relative frequency of outline region to " 142 "the entry block")); 143 144 static cl::opt<unsigned> ExtraOutliningPenalty( 145 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 146 cl::desc("A debug option to add additional penalty to the computed one.")); 147 148 namespace { 149 150 struct FunctionOutliningInfo { 151 FunctionOutliningInfo() = default; 152 153 // Returns the number of blocks to be inlined including all blocks 154 // in Entries and one return block. 155 unsigned getNumInlinedBlocks() const { return Entries.size() + 1; } 156 157 // A set of blocks including the function entry that guard 158 // the region to be outlined. 159 SmallVector<BasicBlock *, 4> Entries; 160 161 // The return block that is not included in the outlined region. 162 BasicBlock *ReturnBlock = nullptr; 163 164 // The dominating block of the region to be outlined. 165 BasicBlock *NonReturnBlock = nullptr; 166 167 // The set of blocks in Entries that that are predecessors to ReturnBlock 168 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 169 }; 170 171 struct FunctionOutliningMultiRegionInfo { 172 FunctionOutliningMultiRegionInfo() {} 173 174 // Container for outline regions 175 struct OutlineRegionInfo { 176 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 177 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 178 BasicBlock *ReturnBlock) 179 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 180 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 181 SmallVector<BasicBlock *, 8> Region; 182 BasicBlock *EntryBlock; 183 BasicBlock *ExitBlock; 184 BasicBlock *ReturnBlock; 185 }; 186 187 SmallVector<OutlineRegionInfo, 4> ORI; 188 }; 189 190 struct PartialInlinerImpl { 191 192 PartialInlinerImpl( 193 function_ref<AssumptionCache &(Function &)> GetAC, 194 function_ref<AssumptionCache *(Function &)> LookupAC, 195 function_ref<TargetTransformInfo &(Function &)> GTTI, 196 function_ref<const TargetLibraryInfo &(Function &)> GTLI, 197 ProfileSummaryInfo &ProfSI, 198 function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr) 199 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 200 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {} 201 202 bool run(Module &M); 203 // Main part of the transformation that calls helper functions to find 204 // outlining candidates, clone & outline the function, and attempt to 205 // partially inline the resulting function. Returns true if 206 // inlining was successful, false otherwise. Also returns the outline 207 // function (only if we partially inlined early returns) as there is a 208 // possibility to further "peel" early return statements that were left in the 209 // outline function due to code size. 210 std::pair<bool, Function *> unswitchFunction(Function &F); 211 212 // This class speculatively clones the function to be partial inlined. 213 // At the end of partial inlining, the remaining callsites to the cloned 214 // function that are not partially inlined will be fixed up to reference 215 // the original function, and the cloned function will be erased. 216 struct FunctionCloner { 217 // Two constructors, one for single region outlining, the other for 218 // multi-region outlining. 219 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 220 OptimizationRemarkEmitter &ORE, 221 function_ref<AssumptionCache *(Function &)> LookupAC, 222 function_ref<TargetTransformInfo &(Function &)> GetTTI); 223 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 224 OptimizationRemarkEmitter &ORE, 225 function_ref<AssumptionCache *(Function &)> LookupAC, 226 function_ref<TargetTransformInfo &(Function &)> GetTTI); 227 228 ~FunctionCloner(); 229 230 // Prepare for function outlining: making sure there is only 231 // one incoming edge from the extracted/outlined region to 232 // the return block. 233 void normalizeReturnBlock() const; 234 235 // Do function outlining for cold regions. 236 bool doMultiRegionFunctionOutlining(); 237 // Do function outlining for region after early return block(s). 238 // NOTE: For vararg functions that do the vararg handling in the outlined 239 // function, we temporarily generate IR that does not properly 240 // forward varargs to the outlined function. Calling InlineFunction 241 // will update calls to the outlined functions to properly forward 242 // the varargs. 243 Function *doSingleRegionFunctionOutlining(); 244 245 Function *OrigFunc = nullptr; 246 Function *ClonedFunc = nullptr; 247 248 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 249 // Keep track of Outlined Functions and the basic block they're called from. 250 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 251 252 // ClonedFunc is inlined in one of its callers after function 253 // outlining. 254 bool IsFunctionInlined = false; 255 // The cost of the region to be outlined. 256 InstructionCost OutlinedRegionCost = 0; 257 // ClonedOI is specific to outlining non-early return blocks. 258 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 259 // ClonedOMRI is specific to outlining cold regions. 260 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 261 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 262 OptimizationRemarkEmitter &ORE; 263 function_ref<AssumptionCache *(Function &)> LookupAC; 264 function_ref<TargetTransformInfo &(Function &)> GetTTI; 265 }; 266 267 private: 268 int NumPartialInlining = 0; 269 function_ref<AssumptionCache &(Function &)> GetAssumptionCache; 270 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 271 function_ref<TargetTransformInfo &(Function &)> GetTTI; 272 function_ref<BlockFrequencyInfo &(Function &)> GetBFI; 273 function_ref<const TargetLibraryInfo &(Function &)> GetTLI; 274 ProfileSummaryInfo &PSI; 275 276 // Return the frequency of the OutlininingBB relative to F's entry point. 277 // The result is no larger than 1 and is represented using BP. 278 // (Note that the outlined region's 'head' block can only have incoming 279 // edges from the guarding entry blocks). 280 BranchProbability 281 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const; 282 283 // Return true if the callee of CB should be partially inlined with 284 // profit. 285 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner, 286 BlockFrequency WeightedOutliningRcost, 287 OptimizationRemarkEmitter &ORE) const; 288 289 // Try to inline DuplicateFunction (cloned from F with call to 290 // the OutlinedFunction into its callers. Return true 291 // if there is any successful inlining. 292 bool tryPartialInline(FunctionCloner &Cloner); 293 294 // Compute the mapping from use site of DuplicationFunction to the enclosing 295 // BB's profile count. 296 void 297 computeCallsiteToProfCountMap(Function *DuplicateFunction, 298 DenseMap<User *, uint64_t> &SiteCountMap) const; 299 300 bool isLimitReached() const { 301 return (MaxNumPartialInlining != -1 && 302 NumPartialInlining >= MaxNumPartialInlining); 303 } 304 305 static CallBase *getSupportedCallBase(User *U) { 306 if (isa<CallInst>(U) || isa<InvokeInst>(U)) 307 return cast<CallBase>(U); 308 llvm_unreachable("All uses must be calls"); 309 return nullptr; 310 } 311 312 static CallBase *getOneCallSiteTo(Function &F) { 313 User *User = *F.user_begin(); 314 return getSupportedCallBase(User); 315 } 316 317 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const { 318 CallBase *CB = getOneCallSiteTo(F); 319 DebugLoc DLoc = CB->getDebugLoc(); 320 BasicBlock *Block = CB->getParent(); 321 return std::make_tuple(DLoc, Block); 322 } 323 324 // Returns the costs associated with function outlining: 325 // - The first value is the non-weighted runtime cost for making the call 326 // to the outlined function, including the addtional setup cost in the 327 // outlined function itself; 328 // - The second value is the estimated size of the new call sequence in 329 // basic block Cloner.OutliningCallBB; 330 std::tuple<InstructionCost, InstructionCost> 331 computeOutliningCosts(FunctionCloner &Cloner) const; 332 333 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 334 // approximate both the size and runtime cost (Note that in the current 335 // inline cost analysis, there is no clear distinction there either). 336 static InstructionCost computeBBInlineCost(BasicBlock *BB, 337 TargetTransformInfo *TTI); 338 339 std::unique_ptr<FunctionOutliningInfo> 340 computeOutliningInfo(Function &F) const; 341 342 std::unique_ptr<FunctionOutliningMultiRegionInfo> 343 computeOutliningColdRegionsInfo(Function &F, 344 OptimizationRemarkEmitter &ORE) const; 345 }; 346 347 struct PartialInlinerLegacyPass : public ModulePass { 348 static char ID; // Pass identification, replacement for typeid 349 350 PartialInlinerLegacyPass() : ModulePass(ID) { 351 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 352 } 353 354 void getAnalysisUsage(AnalysisUsage &AU) const override { 355 AU.addRequired<AssumptionCacheTracker>(); 356 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 357 AU.addRequired<TargetTransformInfoWrapperPass>(); 358 AU.addRequired<TargetLibraryInfoWrapperPass>(); 359 } 360 361 bool runOnModule(Module &M) override { 362 if (skipModule(M)) 363 return false; 364 365 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 366 TargetTransformInfoWrapperPass *TTIWP = 367 &getAnalysis<TargetTransformInfoWrapperPass>(); 368 ProfileSummaryInfo &PSI = 369 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 370 371 auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { 372 return ACT->getAssumptionCache(F); 373 }; 374 375 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { 376 return ACT->lookupAssumptionCache(F); 377 }; 378 379 auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & { 380 return TTIWP->getTTI(F); 381 }; 382 383 auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { 384 return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 385 }; 386 387 return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 388 GetTLI, PSI) 389 .run(M); 390 } 391 }; 392 393 } // end anonymous namespace 394 395 std::unique_ptr<FunctionOutliningMultiRegionInfo> 396 PartialInlinerImpl::computeOutliningColdRegionsInfo( 397 Function &F, OptimizationRemarkEmitter &ORE) const { 398 BasicBlock *EntryBlock = &F.front(); 399 400 DominatorTree DT(F); 401 LoopInfo LI(DT); 402 BranchProbabilityInfo BPI(F, LI); 403 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 404 BlockFrequencyInfo *BFI; 405 if (!GetBFI) { 406 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); 407 BFI = ScopedBFI.get(); 408 } else 409 BFI = &(GetBFI(F)); 410 411 // Return if we don't have profiling information. 412 if (!PSI.hasInstrumentationProfile()) 413 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 414 415 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 416 std::make_unique<FunctionOutliningMultiRegionInfo>(); 417 418 auto IsSingleExit = 419 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 420 BasicBlock *ExitBlock = nullptr; 421 for (auto *Block : BlockList) { 422 for (BasicBlock *Succ : successors(Block)) { 423 if (!is_contained(BlockList, Succ)) { 424 if (ExitBlock) { 425 ORE.emit([&]() { 426 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 427 &Succ->front()) 428 << "Region dominated by " 429 << ore::NV("Block", BlockList.front()->getName()) 430 << " has more than one region exit edge."; 431 }); 432 return nullptr; 433 } 434 435 ExitBlock = Block; 436 } 437 } 438 } 439 return ExitBlock; 440 }; 441 442 auto BBProfileCount = [BFI](BasicBlock *BB) { 443 return BFI->getBlockProfileCount(BB).getValueOr(0); 444 }; 445 446 // Use the same computeBBInlineCost function to compute the cost savings of 447 // the outlining the candidate region. 448 TargetTransformInfo *FTTI = &GetTTI(F); 449 InstructionCost OverallFunctionCost = 0; 450 for (auto &BB : F) 451 OverallFunctionCost += computeBBInlineCost(&BB, FTTI); 452 453 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost 454 << "\n";); 455 456 InstructionCost MinOutlineRegionCost = OverallFunctionCost.map( 457 [&](auto Cost) { return Cost * MinRegionSizeRatio; }); 458 459 BranchProbability MinBranchProbability( 460 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 461 MinBlockCounterExecution); 462 bool ColdCandidateFound = false; 463 BasicBlock *CurrEntry = EntryBlock; 464 std::vector<BasicBlock *> DFS; 465 DenseMap<BasicBlock *, bool> VisitedMap; 466 DFS.push_back(CurrEntry); 467 VisitedMap[CurrEntry] = true; 468 469 // Use Depth First Search on the basic blocks to find CFG edges that are 470 // considered cold. 471 // Cold regions considered must also have its inline cost compared to the 472 // overall inline cost of the original function. The region is outlined only 473 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 474 // more. 475 while (!DFS.empty()) { 476 auto *ThisBB = DFS.back(); 477 DFS.pop_back(); 478 // Only consider regions with predecessor blocks that are considered 479 // not-cold (default: part of the top 99.99% of all block counters) 480 // AND greater than our minimum block execution count (default: 100). 481 if (PSI.isColdBlock(ThisBB, BFI) || 482 BBProfileCount(ThisBB) < MinBlockCounterExecution) 483 continue; 484 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) { 485 if (VisitedMap[*SI]) 486 continue; 487 VisitedMap[*SI] = true; 488 DFS.push_back(*SI); 489 // If branch isn't cold, we skip to the next one. 490 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI); 491 if (SuccProb > MinBranchProbability) 492 continue; 493 494 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->" 495 << SI->getName() 496 << "\nBranch Probability = " << SuccProb << "\n";); 497 498 SmallVector<BasicBlock *, 8> DominateVector; 499 DT.getDescendants(*SI, DominateVector); 500 assert(!DominateVector.empty() && 501 "SI should be reachable and have at least itself as descendant"); 502 503 // We can only outline single entry regions (for now). 504 if (!DominateVector.front()->hasNPredecessors(1)) { 505 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 506 << " doesn't have a single predecessor in the " 507 "dominator tree\n";); 508 continue; 509 } 510 511 BasicBlock *ExitBlock = nullptr; 512 // We can only outline single exit regions (for now). 513 if (!(ExitBlock = IsSingleExit(DominateVector))) { 514 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 515 << " doesn't have a unique successor\n";); 516 continue; 517 } 518 519 InstructionCost OutlineRegionCost = 0; 520 for (auto *BB : DominateVector) 521 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 522 523 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost 524 << "\n";); 525 526 if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) { 527 ORE.emit([&]() { 528 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 529 &SI->front()) 530 << ore::NV("Callee", &F) 531 << " inline cost-savings smaller than " 532 << ore::NV("Cost", MinOutlineRegionCost); 533 }); 534 535 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than " 536 << MinOutlineRegionCost << "\n";); 537 continue; 538 } 539 540 // For now, ignore blocks that belong to a SISE region that is a 541 // candidate for outlining. In the future, we may want to look 542 // at inner regions because the outer region may have live-exit 543 // variables. 544 for (auto *BB : DominateVector) 545 VisitedMap[BB] = true; 546 547 // ReturnBlock here means the block after the outline call 548 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 549 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 550 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 551 OutliningInfo->ORI.push_back(RegInfo); 552 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: " 553 << DominateVector.front()->getName() << "\n";); 554 ColdCandidateFound = true; 555 NumColdRegionsFound++; 556 } 557 } 558 559 if (ColdCandidateFound) 560 return OutliningInfo; 561 562 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 563 } 564 565 std::unique_ptr<FunctionOutliningInfo> 566 PartialInlinerImpl::computeOutliningInfo(Function &F) const { 567 BasicBlock *EntryBlock = &F.front(); 568 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 569 if (!BR || BR->isUnconditional()) 570 return std::unique_ptr<FunctionOutliningInfo>(); 571 572 // Returns true if Succ is BB's successor 573 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 574 return is_contained(successors(BB), Succ); 575 }; 576 577 auto IsReturnBlock = [](BasicBlock *BB) { 578 Instruction *TI = BB->getTerminator(); 579 return isa<ReturnInst>(TI); 580 }; 581 582 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 583 if (IsReturnBlock(Succ1)) 584 return std::make_tuple(Succ1, Succ2); 585 if (IsReturnBlock(Succ2)) 586 return std::make_tuple(Succ2, Succ1); 587 588 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 589 }; 590 591 // Detect a triangular shape: 592 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 593 if (IsSuccessor(Succ1, Succ2)) 594 return std::make_tuple(Succ1, Succ2); 595 if (IsSuccessor(Succ2, Succ1)) 596 return std::make_tuple(Succ2, Succ1); 597 598 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 599 }; 600 601 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 602 std::make_unique<FunctionOutliningInfo>(); 603 604 BasicBlock *CurrEntry = EntryBlock; 605 bool CandidateFound = false; 606 do { 607 // The number of blocks to be inlined has already reached 608 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 609 // disables partial inlining for the function. 610 if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks) 611 break; 612 613 if (succ_size(CurrEntry) != 2) 614 break; 615 616 BasicBlock *Succ1 = *succ_begin(CurrEntry); 617 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 618 619 BasicBlock *ReturnBlock, *NonReturnBlock; 620 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 621 622 if (ReturnBlock) { 623 OutliningInfo->Entries.push_back(CurrEntry); 624 OutliningInfo->ReturnBlock = ReturnBlock; 625 OutliningInfo->NonReturnBlock = NonReturnBlock; 626 CandidateFound = true; 627 break; 628 } 629 630 BasicBlock *CommSucc, *OtherSucc; 631 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 632 633 if (!CommSucc) 634 break; 635 636 OutliningInfo->Entries.push_back(CurrEntry); 637 CurrEntry = OtherSucc; 638 } while (true); 639 640 if (!CandidateFound) 641 return std::unique_ptr<FunctionOutliningInfo>(); 642 643 // There should not be any successors (not in the entry set) other than 644 // {ReturnBlock, NonReturnBlock} 645 assert(OutliningInfo->Entries[0] == &F.front() && 646 "Function Entry must be the first in Entries vector"); 647 DenseSet<BasicBlock *> Entries; 648 for (BasicBlock *E : OutliningInfo->Entries) 649 Entries.insert(E); 650 651 // Returns true of BB has Predecessor which is not 652 // in Entries set. 653 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 654 for (auto *Pred : predecessors(BB)) { 655 if (!Entries.count(Pred)) 656 return true; 657 } 658 return false; 659 }; 660 auto CheckAndNormalizeCandidate = 661 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 662 for (BasicBlock *E : OutliningInfo->Entries) { 663 for (auto *Succ : successors(E)) { 664 if (Entries.count(Succ)) 665 continue; 666 if (Succ == OutliningInfo->ReturnBlock) 667 OutliningInfo->ReturnBlockPreds.push_back(E); 668 else if (Succ != OutliningInfo->NonReturnBlock) 669 return false; 670 } 671 // There should not be any outside incoming edges either: 672 if (HasNonEntryPred(E)) 673 return false; 674 } 675 return true; 676 }; 677 678 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 679 return std::unique_ptr<FunctionOutliningInfo>(); 680 681 // Now further growing the candidate's inlining region by 682 // peeling off dominating blocks from the outlining region: 683 while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) { 684 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 685 if (succ_size(Cand) != 2) 686 break; 687 688 if (HasNonEntryPred(Cand)) 689 break; 690 691 BasicBlock *Succ1 = *succ_begin(Cand); 692 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 693 694 BasicBlock *ReturnBlock, *NonReturnBlock; 695 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 696 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 697 break; 698 699 if (NonReturnBlock->getSinglePredecessor() != Cand) 700 break; 701 702 // Now grow and update OutlininigInfo: 703 OutliningInfo->Entries.push_back(Cand); 704 OutliningInfo->NonReturnBlock = NonReturnBlock; 705 OutliningInfo->ReturnBlockPreds.push_back(Cand); 706 Entries.insert(Cand); 707 } 708 709 return OutliningInfo; 710 } 711 712 // Check if there is PGO data or user annotated branch data: 713 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) { 714 if (F.hasProfileData()) 715 return true; 716 // Now check if any of the entry block has MD_prof data: 717 for (auto *E : OI.Entries) { 718 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 719 if (!BR || BR->isUnconditional()) 720 continue; 721 uint64_t T, F; 722 if (BR->extractProfMetadata(T, F)) 723 return true; 724 } 725 return false; 726 } 727 728 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( 729 FunctionCloner &Cloner) const { 730 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 731 auto EntryFreq = 732 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 733 auto OutliningCallFreq = 734 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 735 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 736 // we outlined any regions, so we may encounter situations where the 737 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 738 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) 739 OutliningCallFreq = EntryFreq; 740 741 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 742 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 743 744 if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get())) 745 return OutlineRegionRelFreq; 746 747 // When profile data is not available, we need to be conservative in 748 // estimating the overall savings. Static branch prediction can usually 749 // guess the branch direction right (taken/non-taken), but the guessed 750 // branch probability is usually not biased enough. In case when the 751 // outlined region is predicted to be likely, its probability needs 752 // to be made higher (more biased) to not under-estimate the cost of 753 // function outlining. On the other hand, if the outlined region 754 // is predicted to be less likely, the predicted probablity is usually 755 // higher than the actual. For instance, the actual probability of the 756 // less likely target is only 5%, but the guessed probablity can be 757 // 40%. In the latter case, there is no need for further adjustement. 758 // FIXME: add an option for this. 759 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 760 return OutlineRegionRelFreq; 761 762 OutlineRegionRelFreq = std::max( 763 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 764 765 return OutlineRegionRelFreq; 766 } 767 768 bool PartialInlinerImpl::shouldPartialInline( 769 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, 770 OptimizationRemarkEmitter &ORE) const { 771 using namespace ore; 772 773 Function *Callee = CB.getCalledFunction(); 774 assert(Callee == Cloner.ClonedFunc); 775 776 if (SkipCostAnalysis) 777 return isInlineViable(*Callee).isSuccess(); 778 779 Function *Caller = CB.getCaller(); 780 auto &CalleeTTI = GetTTI(*Callee); 781 bool RemarksEnabled = 782 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled( 783 DEBUG_TYPE); 784 InlineCost IC = 785 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache, 786 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr); 787 788 if (IC.isAlways()) { 789 ORE.emit([&]() { 790 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB) 791 << NV("Callee", Cloner.OrigFunc) 792 << " should always be fully inlined, not partially"; 793 }); 794 return false; 795 } 796 797 if (IC.isNever()) { 798 ORE.emit([&]() { 799 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB) 800 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 801 << NV("Caller", Caller) 802 << " because it should never be inlined (cost=never)"; 803 }); 804 return false; 805 } 806 807 if (!IC) { 808 ORE.emit([&]() { 809 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB) 810 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 811 << NV("Caller", Caller) << " because too costly to inline (cost=" 812 << NV("Cost", IC.getCost()) << ", threshold=" 813 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 814 }); 815 return false; 816 } 817 const DataLayout &DL = Caller->getParent()->getDataLayout(); 818 819 // The savings of eliminating the call: 820 int NonWeightedSavings = getCallsiteCost(CB, DL); 821 BlockFrequency NormWeightedSavings(NonWeightedSavings); 822 823 // Weighted saving is smaller than weighted cost, return false 824 if (NormWeightedSavings < WeightedOutliningRcost) { 825 ORE.emit([&]() { 826 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 827 &CB) 828 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 829 << NV("Caller", Caller) << " runtime overhead (overhead=" 830 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 831 << ", savings=" 832 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 833 << ")" 834 << " of making the outlined call is too high"; 835 }); 836 837 return false; 838 } 839 840 ORE.emit([&]() { 841 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB) 842 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 843 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 844 << " (threshold=" 845 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 846 }); 847 return true; 848 } 849 850 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 851 // For now use a simplified version. The returned 'InlineCost' will be used 852 // to esimate the size cost as well as runtime cost of the BB. 853 InstructionCost 854 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, 855 TargetTransformInfo *TTI) { 856 InstructionCost InlineCost = 0; 857 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 858 for (Instruction &I : BB->instructionsWithoutDebug()) { 859 // Skip free instructions. 860 switch (I.getOpcode()) { 861 case Instruction::BitCast: 862 case Instruction::PtrToInt: 863 case Instruction::IntToPtr: 864 case Instruction::Alloca: 865 case Instruction::PHI: 866 continue; 867 case Instruction::GetElementPtr: 868 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices()) 869 continue; 870 break; 871 default: 872 break; 873 } 874 875 if (I.isLifetimeStartOrEnd()) 876 continue; 877 878 if (auto *II = dyn_cast<IntrinsicInst>(&I)) { 879 Intrinsic::ID IID = II->getIntrinsicID(); 880 SmallVector<Type *, 4> Tys; 881 FastMathFlags FMF; 882 for (Value *Val : II->args()) 883 Tys.push_back(Val->getType()); 884 885 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 886 FMF = FPMO->getFastMathFlags(); 887 888 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); 889 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); 890 continue; 891 } 892 893 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 894 InlineCost += getCallsiteCost(*CI, DL); 895 continue; 896 } 897 898 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { 899 InlineCost += getCallsiteCost(*II, DL); 900 continue; 901 } 902 903 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) { 904 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 905 continue; 906 } 907 InlineCost += InlineConstants::InstrCost; 908 } 909 910 return InlineCost; 911 } 912 913 std::tuple<InstructionCost, InstructionCost> 914 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { 915 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 916 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 917 Function *OutlinedFunc = FuncBBPair.first; 918 BasicBlock* OutliningCallBB = FuncBBPair.second; 919 // Now compute the cost of the call sequence to the outlined function 920 // 'OutlinedFunction' in BB 'OutliningCallBB': 921 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 922 OutliningFuncCallCost += 923 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 924 925 // Now compute the cost of the extracted/outlined function itself: 926 for (BasicBlock &BB : *OutlinedFunc) 927 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 928 } 929 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 930 "Outlined function cost should be no less than the outlined region"); 931 932 // The code extractor introduces a new root and exit stub blocks with 933 // additional unconditional branches. Those branches will be eliminated 934 // later with bb layout. The cost should be adjusted accordingly: 935 OutlinedFunctionCost -= 936 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 937 938 InstructionCost OutliningRuntimeOverhead = 939 OutliningFuncCallCost + 940 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 941 ExtraOutliningPenalty.getValue(); 942 943 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 944 } 945 946 // Create the callsite to profile count map which is 947 // used to update the original function's entry count, 948 // after the function is partially inlined into the callsite. 949 void PartialInlinerImpl::computeCallsiteToProfCountMap( 950 Function *DuplicateFunction, 951 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const { 952 std::vector<User *> Users(DuplicateFunction->user_begin(), 953 DuplicateFunction->user_end()); 954 Function *CurrentCaller = nullptr; 955 std::unique_ptr<BlockFrequencyInfo> TempBFI; 956 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 957 958 auto ComputeCurrBFI = [&,this](Function *Caller) { 959 // For the old pass manager: 960 if (!GetBFI) { 961 DominatorTree DT(*Caller); 962 LoopInfo LI(DT); 963 BranchProbabilityInfo BPI(*Caller, LI); 964 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 965 CurrentCallerBFI = TempBFI.get(); 966 } else { 967 // New pass manager: 968 CurrentCallerBFI = &(GetBFI(*Caller)); 969 } 970 }; 971 972 for (User *User : Users) { 973 // Don't bother with BlockAddress used by CallBr for asm goto. 974 if (isa<BlockAddress>(User)) 975 continue; 976 CallBase *CB = getSupportedCallBase(User); 977 Function *Caller = CB->getCaller(); 978 if (CurrentCaller != Caller) { 979 CurrentCaller = Caller; 980 ComputeCurrBFI(Caller); 981 } else { 982 assert(CurrentCallerBFI && "CallerBFI is not set"); 983 } 984 BasicBlock *CallBB = CB->getParent(); 985 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 986 if (Count) 987 CallSiteToProfCountMap[User] = *Count; 988 else 989 CallSiteToProfCountMap[User] = 0; 990 } 991 } 992 993 PartialInlinerImpl::FunctionCloner::FunctionCloner( 994 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, 995 function_ref<AssumptionCache *(Function &)> LookupAC, 996 function_ref<TargetTransformInfo &(Function &)> GetTTI) 997 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 998 ClonedOI = std::make_unique<FunctionOutliningInfo>(); 999 1000 // Clone the function, so that we can hack away on it. 1001 ValueToValueMapTy VMap; 1002 ClonedFunc = CloneFunction(F, VMap); 1003 1004 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 1005 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 1006 for (BasicBlock *BB : OI->Entries) 1007 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 1008 1009 for (BasicBlock *E : OI->ReturnBlockPreds) { 1010 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 1011 ClonedOI->ReturnBlockPreds.push_back(NewE); 1012 } 1013 // Go ahead and update all uses to the duplicate, so that we can just 1014 // use the inliner functionality when we're done hacking. 1015 F->replaceAllUsesWith(ClonedFunc); 1016 } 1017 1018 PartialInlinerImpl::FunctionCloner::FunctionCloner( 1019 Function *F, FunctionOutliningMultiRegionInfo *OI, 1020 OptimizationRemarkEmitter &ORE, 1021 function_ref<AssumptionCache *(Function &)> LookupAC, 1022 function_ref<TargetTransformInfo &(Function &)> GetTTI) 1023 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 1024 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); 1025 1026 // Clone the function, so that we can hack away on it. 1027 ValueToValueMapTy VMap; 1028 ClonedFunc = CloneFunction(F, VMap); 1029 1030 // Go through all Outline Candidate Regions and update all BasicBlock 1031 // information. 1032 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1033 OI->ORI) { 1034 SmallVector<BasicBlock *, 8> Region; 1035 for (BasicBlock *BB : RegionInfo.Region) 1036 Region.push_back(cast<BasicBlock>(VMap[BB])); 1037 1038 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1039 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1040 BasicBlock *NewReturnBlock = nullptr; 1041 if (RegionInfo.ReturnBlock) 1042 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1043 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1044 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1045 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1046 } 1047 // Go ahead and update all uses to the duplicate, so that we can just 1048 // use the inliner functionality when we're done hacking. 1049 F->replaceAllUsesWith(ClonedFunc); 1050 } 1051 1052 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { 1053 auto GetFirstPHI = [](BasicBlock *BB) { 1054 BasicBlock::iterator I = BB->begin(); 1055 PHINode *FirstPhi = nullptr; 1056 while (I != BB->end()) { 1057 PHINode *Phi = dyn_cast<PHINode>(I); 1058 if (!Phi) 1059 break; 1060 if (!FirstPhi) { 1061 FirstPhi = Phi; 1062 break; 1063 } 1064 } 1065 return FirstPhi; 1066 }; 1067 1068 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1069 // blocks. 1070 if (!ClonedOI) 1071 return; 1072 1073 // Special hackery is needed with PHI nodes that have inputs from more than 1074 // one extracted block. For simplicity, just split the PHIs into a two-level 1075 // sequence of PHIs, some of which will go in the extracted region, and some 1076 // of which will go outside. 1077 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1078 // only split block when necessary: 1079 PHINode *FirstPhi = GetFirstPHI(PreReturn); 1080 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1081 1082 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1083 return; 1084 1085 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1086 Value *CommonValue = PN->getIncomingValue(0); 1087 if (all_of(PN->incoming_values(), 1088 [&](Value *V) { return V == CommonValue; })) 1089 return CommonValue; 1090 return nullptr; 1091 }; 1092 1093 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1094 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1095 BasicBlock::iterator I = PreReturn->begin(); 1096 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1097 SmallVector<Instruction *, 4> DeadPhis; 1098 while (I != PreReturn->end()) { 1099 PHINode *OldPhi = dyn_cast<PHINode>(I); 1100 if (!OldPhi) 1101 break; 1102 1103 PHINode *RetPhi = 1104 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1105 OldPhi->replaceAllUsesWith(RetPhi); 1106 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1107 1108 RetPhi->addIncoming(&*I, PreReturn); 1109 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1110 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1111 OldPhi->removeIncomingValue(E); 1112 } 1113 1114 // After incoming values splitting, the old phi may become trivial. 1115 // Keeping the trivial phi can introduce definition inside the outline 1116 // region which is live-out, causing necessary overhead (load, store 1117 // arg passing etc). 1118 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1119 OldPhi->replaceAllUsesWith(OldPhiVal); 1120 DeadPhis.push_back(OldPhi); 1121 } 1122 ++I; 1123 } 1124 for (auto *DP : DeadPhis) 1125 DP->eraseFromParent(); 1126 1127 for (auto *E : ClonedOI->ReturnBlockPreds) 1128 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1129 } 1130 1131 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1132 1133 auto ComputeRegionCost = 1134 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost { 1135 InstructionCost Cost = 0; 1136 for (BasicBlock* BB : Region) 1137 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 1138 return Cost; 1139 }; 1140 1141 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1142 1143 if (ClonedOMRI->ORI.empty()) 1144 return false; 1145 1146 // The CodeExtractor needs a dominator tree. 1147 DominatorTree DT; 1148 DT.recalculate(*ClonedFunc); 1149 1150 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1151 LoopInfo LI(DT); 1152 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1153 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1154 1155 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 1156 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1157 1158 SetVector<Value *> Inputs, Outputs, Sinks; 1159 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1160 ClonedOMRI->ORI) { 1161 InstructionCost CurrentOutlinedRegionCost = 1162 ComputeRegionCost(RegionInfo.Region); 1163 1164 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1165 ClonedFuncBFI.get(), &BPI, 1166 LookupAC(*RegionInfo.EntryBlock->getParent()), 1167 /* AllowVarargs */ false); 1168 1169 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1170 1171 LLVM_DEBUG({ 1172 dbgs() << "inputs: " << Inputs.size() << "\n"; 1173 dbgs() << "outputs: " << Outputs.size() << "\n"; 1174 for (Value *value : Inputs) 1175 dbgs() << "value used in func: " << *value << "\n"; 1176 for (Value *output : Outputs) 1177 dbgs() << "instr used in func: " << *output << "\n"; 1178 }); 1179 1180 // Do not extract regions that have live exit variables. 1181 if (Outputs.size() > 0 && !ForceLiveExit) 1182 continue; 1183 1184 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) { 1185 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc); 1186 BasicBlock *OutliningCallBB = OCS->getParent(); 1187 assert(OutliningCallBB->getParent() == ClonedFunc); 1188 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1189 NumColdRegionsOutlined++; 1190 OutlinedRegionCost += CurrentOutlinedRegionCost; 1191 1192 if (MarkOutlinedColdCC) { 1193 OutlinedFunc->setCallingConv(CallingConv::Cold); 1194 OCS->setCallingConv(CallingConv::Cold); 1195 } 1196 } else 1197 ORE.emit([&]() { 1198 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1199 &RegionInfo.Region.front()->front()) 1200 << "Failed to extract region at block " 1201 << ore::NV("Block", RegionInfo.Region.front()); 1202 }); 1203 } 1204 1205 return !OutlinedFunctions.empty(); 1206 } 1207 1208 Function * 1209 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1210 // Returns true if the block is to be partial inlined into the caller 1211 // (i.e. not to be extracted to the out of line function) 1212 auto ToBeInlined = [&, this](BasicBlock *BB) { 1213 return BB == ClonedOI->ReturnBlock || 1214 llvm::is_contained(ClonedOI->Entries, BB); 1215 }; 1216 1217 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1218 // The CodeExtractor needs a dominator tree. 1219 DominatorTree DT; 1220 DT.recalculate(*ClonedFunc); 1221 1222 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1223 LoopInfo LI(DT); 1224 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1225 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1226 1227 // Gather up the blocks that we're going to extract. 1228 std::vector<BasicBlock *> ToExtract; 1229 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); 1230 ToExtract.push_back(ClonedOI->NonReturnBlock); 1231 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( 1232 ClonedOI->NonReturnBlock, ClonedFuncTTI); 1233 for (BasicBlock &BB : *ClonedFunc) 1234 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1235 ToExtract.push_back(&BB); 1236 // FIXME: the code extractor may hoist/sink more code 1237 // into the outlined function which may make the outlining 1238 // overhead (the difference of the outlined function cost 1239 // and OutliningRegionCost) look larger. 1240 OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); 1241 } 1242 1243 // Extract the body of the if. 1244 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1245 Function *OutlinedFunc = 1246 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1247 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), 1248 /* AllowVarargs */ true) 1249 .extractCodeRegion(CEAC); 1250 1251 if (OutlinedFunc) { 1252 BasicBlock *OutliningCallBB = 1253 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent(); 1254 assert(OutliningCallBB->getParent() == ClonedFunc); 1255 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1256 } else 1257 ORE.emit([&]() { 1258 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1259 &ToExtract.front()->front()) 1260 << "Failed to extract region at block " 1261 << ore::NV("Block", ToExtract.front()); 1262 }); 1263 1264 return OutlinedFunc; 1265 } 1266 1267 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1268 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1269 // users (function pointers, etc.) back to the original function. 1270 ClonedFunc->replaceAllUsesWith(OrigFunc); 1271 ClonedFunc->eraseFromParent(); 1272 if (!IsFunctionInlined) { 1273 // Remove each function that was speculatively created if there is no 1274 // reference. 1275 for (auto FuncBBPair : OutlinedFunctions) { 1276 Function *Func = FuncBBPair.first; 1277 Func->eraseFromParent(); 1278 } 1279 } 1280 } 1281 1282 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) { 1283 if (F.hasAddressTaken()) 1284 return {false, nullptr}; 1285 1286 // Let inliner handle it 1287 if (F.hasFnAttribute(Attribute::AlwaysInline)) 1288 return {false, nullptr}; 1289 1290 if (F.hasFnAttribute(Attribute::NoInline)) 1291 return {false, nullptr}; 1292 1293 if (PSI.isFunctionEntryCold(&F)) 1294 return {false, nullptr}; 1295 1296 if (F.users().empty()) 1297 return {false, nullptr}; 1298 1299 OptimizationRemarkEmitter ORE(&F); 1300 1301 // Only try to outline cold regions if we have a profile summary, which 1302 // implies we have profiling information. 1303 if (PSI.hasProfileSummary() && F.hasProfileData() && 1304 !DisableMultiRegionPartialInline) { 1305 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1306 computeOutliningColdRegionsInfo(F, ORE); 1307 if (OMRI) { 1308 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); 1309 1310 LLVM_DEBUG({ 1311 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n"; 1312 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold() 1313 << "\n"; 1314 }); 1315 1316 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1317 1318 if (DidOutline) { 1319 LLVM_DEBUG({ 1320 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1321 Cloner.ClonedFunc->print(dbgs()); 1322 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1323 }); 1324 1325 if (tryPartialInline(Cloner)) 1326 return {true, nullptr}; 1327 } 1328 } 1329 } 1330 1331 // Fall-thru to regular partial inlining if we: 1332 // i) can't find any cold regions to outline, or 1333 // ii) can't inline the outlined function anywhere. 1334 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1335 if (!OI) 1336 return {false, nullptr}; 1337 1338 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI); 1339 Cloner.normalizeReturnBlock(); 1340 1341 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1342 1343 if (!OutlinedFunction) 1344 return {false, nullptr}; 1345 1346 if (tryPartialInline(Cloner)) 1347 return {true, OutlinedFunction}; 1348 1349 return {false, nullptr}; 1350 } 1351 1352 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1353 if (Cloner.OutlinedFunctions.empty()) 1354 return false; 1355 1356 int SizeCost = 0; 1357 BlockFrequency WeightedRcost; 1358 int NonWeightedRcost; 1359 1360 auto OutliningCosts = computeOutliningCosts(Cloner); 1361 assert(std::get<0>(OutliningCosts).isValid() && 1362 std::get<1>(OutliningCosts).isValid() && "Expected valid costs"); 1363 1364 SizeCost = *std::get<0>(OutliningCosts).getValue(); 1365 NonWeightedRcost = *std::get<1>(OutliningCosts).getValue(); 1366 1367 // Only calculate RelativeToEntryFreq when we are doing single region 1368 // outlining. 1369 BranchProbability RelativeToEntryFreq; 1370 if (Cloner.ClonedOI) 1371 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1372 else 1373 // RelativeToEntryFreq doesn't make sense when we have more than one 1374 // outlined call because each call will have a different relative frequency 1375 // to the entry block. We can consider using the average, but the 1376 // usefulness of that information is questionable. For now, assume we never 1377 // execute the calls to outlined functions. 1378 RelativeToEntryFreq = BranchProbability(0, 1); 1379 1380 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1381 1382 // The call sequence(s) to the outlined function(s) are larger than the sum of 1383 // the original outlined region size(s), it does not increase the chances of 1384 // inlining the function with outlining (The inliner uses the size increase to 1385 // model the cost of inlining a callee). 1386 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1387 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1388 DebugLoc DLoc; 1389 BasicBlock *Block; 1390 std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc); 1391 OrigFuncORE.emit([&]() { 1392 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1393 DLoc, Block) 1394 << ore::NV("Function", Cloner.OrigFunc) 1395 << " not partially inlined into callers (Original Size = " 1396 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1397 << ", Size of call sequence to outlined function = " 1398 << ore::NV("NewSize", SizeCost) << ")"; 1399 }); 1400 return false; 1401 } 1402 1403 assert(Cloner.OrigFunc->users().empty() && 1404 "F's users should all be replaced!"); 1405 1406 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1407 Cloner.ClonedFunc->user_end()); 1408 1409 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1410 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1411 if (CalleeEntryCount) 1412 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1413 1414 uint64_t CalleeEntryCountV = 1415 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); 1416 1417 bool AnyInline = false; 1418 for (User *User : Users) { 1419 // Don't bother with BlockAddress used by CallBr for asm goto. 1420 if (isa<BlockAddress>(User)) 1421 continue; 1422 1423 CallBase *CB = getSupportedCallBase(User); 1424 1425 if (isLimitReached()) 1426 continue; 1427 1428 OptimizationRemarkEmitter CallerORE(CB->getCaller()); 1429 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE)) 1430 continue; 1431 1432 // Construct remark before doing the inlining, as after successful inlining 1433 // the callsite is removed. 1434 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB); 1435 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1436 << ore::NV("Caller", CB->getCaller()); 1437 1438 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI); 1439 // We can only forward varargs when we outlined a single region, else we 1440 // bail on vararg functions. 1441 if (!InlineFunction(*CB, IFI, nullptr, true, 1442 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1443 : nullptr)) 1444 .isSuccess()) 1445 continue; 1446 1447 CallerORE.emit(OR); 1448 1449 // Now update the entry count: 1450 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1451 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1452 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1453 } 1454 1455 AnyInline = true; 1456 NumPartialInlining++; 1457 // Update the stats 1458 if (Cloner.ClonedOI) 1459 NumPartialInlined++; 1460 else 1461 NumColdOutlinePartialInlined++; 1462 } 1463 1464 if (AnyInline) { 1465 Cloner.IsFunctionInlined = true; 1466 if (CalleeEntryCount) 1467 Cloner.OrigFunc->setEntryCount(Function::ProfileCount( 1468 CalleeEntryCountV, CalleeEntryCount->getType())); 1469 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1470 OrigFuncORE.emit([&]() { 1471 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1472 << "Partially inlined into at least one caller"; 1473 }); 1474 } 1475 1476 return AnyInline; 1477 } 1478 1479 bool PartialInlinerImpl::run(Module &M) { 1480 if (DisablePartialInlining) 1481 return false; 1482 1483 std::vector<Function *> Worklist; 1484 Worklist.reserve(M.size()); 1485 for (Function &F : M) 1486 if (!F.use_empty() && !F.isDeclaration()) 1487 Worklist.push_back(&F); 1488 1489 bool Changed = false; 1490 while (!Worklist.empty()) { 1491 Function *CurrFunc = Worklist.back(); 1492 Worklist.pop_back(); 1493 1494 if (CurrFunc->use_empty()) 1495 continue; 1496 1497 bool Recursive = false; 1498 for (User *U : CurrFunc->users()) 1499 if (Instruction *I = dyn_cast<Instruction>(U)) 1500 if (I->getParent()->getParent() == CurrFunc) { 1501 Recursive = true; 1502 break; 1503 } 1504 if (Recursive) 1505 continue; 1506 1507 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc); 1508 if (Result.second) 1509 Worklist.push_back(Result.second); 1510 Changed |= Result.first; 1511 } 1512 1513 return Changed; 1514 } 1515 1516 char PartialInlinerLegacyPass::ID = 0; 1517 1518 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 1519 "Partial Inliner", false, false) 1520 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1521 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1522 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1523 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 1524 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 1525 "Partial Inliner", false, false) 1526 1527 ModulePass *llvm::createPartialInliningPass() { 1528 return new PartialInlinerLegacyPass(); 1529 } 1530 1531 PreservedAnalyses PartialInlinerPass::run(Module &M, 1532 ModuleAnalysisManager &AM) { 1533 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1534 1535 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { 1536 return FAM.getResult<AssumptionAnalysis>(F); 1537 }; 1538 1539 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { 1540 return FAM.getCachedResult<AssumptionAnalysis>(F); 1541 }; 1542 1543 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { 1544 return FAM.getResult<BlockFrequencyAnalysis>(F); 1545 }; 1546 1547 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { 1548 return FAM.getResult<TargetIRAnalysis>(F); 1549 }; 1550 1551 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { 1552 return FAM.getResult<TargetLibraryAnalysis>(F); 1553 }; 1554 1555 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M); 1556 1557 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 1558 GetTLI, PSI, GetBFI) 1559 .run(M)) 1560 return PreservedAnalyses::none(); 1561 return PreservedAnalyses::all(); 1562 } 1563