//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/User.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",

                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));

// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));

// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

// Describes a single-region outlining candidate: the guarding entry blocks
// that stay inlined, and the return/non-return successors that delimit the
// region to be extracted.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

// Describes the set of cold single-entry/single-exit regions found for
// multi-region (cold-region) outlining.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() = default;

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region, BasicBlock *EntryBlock,
                      BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    // All blocks of the region; EntryBlock dominates them.
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    // The unique block inside the region with an edge leaving it.
    BasicBlock *ExitBlock;
    // The block after the outline call (ExitBlock's single successor).
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

// Implementation of the partial inlining transformation: finds outlining
// candidates, clones and outlines them, and tries to inline the slimmed-down
// clone into its callers.
struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  bool run(Module &M);

  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function &F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void normalizeReturnBlock() const;

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();

    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    // function, we temporarily generate IR that does not properly
    // forward varargs to the outlined function. Calling InlineFunction
    // will update calls to the outlined functions to properly forward
    // the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;

    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    InstructionCost OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  // Number of successful partial inlinings performed so far; checked against
  // MaxNumPartialInlining in isLimitReached().
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability
  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE) const;

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void
  computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                DenseMap<User *, uint64_t> &SiteCountMap) const;

  // Whether the module-wide cap on partial inlinings has been reached.
  // A MaxNumPartialInlining value of -1 means "no limit".
  bool isLimitReached() const {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // All uses of the cloned function are expected to be direct call sites;
  // diagnose (and abort in debug builds) anything else.
  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

  // Return the first call site to F. Precondition: F has at least one user.
  static CallBase *getOneCallSiteTo(Function &F) {
    User *User = *F.user_begin();
    return getSupportedCallBase(User);
  }

  // Debug location and enclosing block of one call site to F; used when
  // emitting optimization remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<InstructionCost, InstructionCost>
  computeOutliningCosts(FunctionCloner &Cloner) const;

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static InstructionCost computeBBInlineCost(BasicBlock *BB,
                                             TargetTransformInfo *TTI);

  // Analyze F for the single-region (early-return) outlining shape.
  std::unique_ptr<FunctionOutliningInfo>
  computeOutliningInfo(Function &F) const;

  // Analyze F (requires instrumentation profile) for cold single-entry/exit
  // regions worth outlining.
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function &F,
                                  OptimizationRemarkEmitter &ORE) const;
};

} // end anonymous namespace

// Find cold single-entry/single-exit regions of F that are worth outlining.
// Returns null when no instrumentation profile is available or when no
// profitable cold region is found.
std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(
    Function &F, OptimizationRemarkEmitter &ORE) const {
  BasicBlock *EntryBlock = &F.front();

  DominatorTree DT(F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    // No callback from the pass manager: build a local BFI for F.
    ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(GetBFI(F));

  // Return if we don't have profiling information.
  if (!PSI.hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      std::make_unique<FunctionOutliningMultiRegionInfo>();

  // Returns the unique block of BlockList with an edge leaving the region,
  // or null (with a missed-optimization remark) when there is more than one.
  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (BasicBlock *Succ : successors(Block)) {
        if (!is_contained(BlockList, Succ)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &Succ->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          }

          ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  // Profile count of BB, treating "no count" as zero.
  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB).value_or(0);
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // outlining the candidate region.
  TargetTransformInfo *FTTI = &GetTTI(F);
  InstructionCost OverallFunctionCost = 0;
  for (auto &BB : F)
    OverallFunctionCost += computeBBInlineCost(&BB, FTTI);

  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
                    << "\n";);

  // A region must save at least MinRegionSizeRatio of the whole function's
  // inline cost to be considered.
  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
      [&](auto Cost) { return Cost * MinRegionSizeRatio; });

  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  SmallPtrSet<BasicBlock *, 8> VisitedSet;
  DFS.push_back(CurrEntry);
  VisitedSet.insert(CurrEntry);

  // Use Depth First Search on the basic blocks to find CFG edges that are
  // considered cold.
  // Cold regions considered must also have its inline cost compared to the
  // overall inline cost of the original function. The region is outlined only
  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *ThisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI.isColdBlock(ThisBB, BFI) ||
        BBProfileCount(ThisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
      if (!VisitedSet.insert(*SI).second)
        continue;
      DFS.push_back(*SI);
      // If branch isn't cold, we skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;

      LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
                        << SI->getName()
                        << "\nBranch Probability = " << SuccProb << "\n";);

      // The candidate region is everything dominated by the cold successor.
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      assert(!DominateVector.empty() &&
             "SI should be reachable and have at least itself as descendant");

      // We can only outline single entry regions (for now).
      if (!DominateVector.front()->hasNPredecessors(1)) {
        LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
                          << " doesn't have a single predecessor in the "
                             "dominator tree\n";);
        continue;
      }

      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector))) {
        LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
                          << " doesn't have a unique successor\n";);
        continue;
      }

      InstructionCost OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));

      LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
                        << "\n";);

      if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", &F)
                 << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });

        LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
                          << MinOutlineRegionCost << "\n";);
        continue;
      }

      // For now, ignore blocks that belong to a SISE region that is a
      // candidate for outlining. In the future, we may want to look
      // at inner regions because the outer region may have live-exit
      // variables.
      VisitedSet.insert_range(DominateVector);

      // ReturnBlock here means the block after the outline call
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      OutliningInfo->ORI.push_back(RegInfo);
      LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
                        << DominateVector.front()->getName() << "\n";);
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }

  if (ColdCandidateFound)
    return OutliningInfo;

  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

// Detect the single-region (early-return) outlining shape: a chain of
// two-successor guard blocks starting at the entry, ending at a block whose
// one successor returns. Returns null when F does not match.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
  BasicBlock *EntryBlock = &F.front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair so the returning block (if any) comes first; both null
  // when neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo
      = std::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  // Walk forward from the entry, collecting guard blocks until we find one
  // whose two successors are a return block and a non-return block.
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // There should not be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries(llvm::from_range, OutliningInfo->Entries);

  // Returns true if BB has a Predecessor which is not
  // in Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verify the entry chain is closed (no stray edges in or out) and record
  // which entries branch directly to the return block.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
  if (F.hasProfileData())
    return true;

  // Now check if any of the entry block has MD_prof data:
  for (auto *E : OI.Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    if (hasBranchWeightMD(*BR))
      return true;
  }
  return false;
}

// Estimate how often the outlining call block executes relative to the entry
// block of the cloned function, expressed as a BranchProbability (<= 1).
BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
    FunctionCloner &Cloner) const {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
    OutliningCallFreq = EntryFreq;

  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

// Decide, using the inliner's cost model plus the weighted outlining overhead,
// whether inlining the cloned (slimmed) function at CB is profitable. Emits
// an optimization remark explaining the decision either way.
bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Testing hook: bypass the cost model and only require viability.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
InstructionCost
PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
                                        TargetTransformInfo *TTI) {
  InstructionCost InlineCost = 0;
  const DataLayout &DL = BB->getDataLayout();
  int InstrCost = InlineConstants::getInstrCost();
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      // An all-zero-index GEP is a no-op address computation.
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    // Lifetime markers are free; must be checked before the generic
    // intrinsic handling below since they are intrinsics themselves.
    if (I.isLifetimeStartOrEnd())
      continue;

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      Intrinsic::ID IID = II->getIntrinsicID();
      SmallVector<Type *, 4> Tys;
      FastMathFlags FMF;
      for (Value *Val : II->args())
        Tys.push_back(Val->getType());

      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();

      IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
      InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
      continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(*TTI, *CI, DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(*TTI, *II, DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      // One compare-and-branch per case plus the default.
      InlineCost += (SI->getNumCases() + 1) * InstrCost;
continue; 852 } 853 InlineCost += InstrCost; 854 } 855 856 return InlineCost; 857 } 858 859 std::tuple<InstructionCost, InstructionCost> 860 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { 861 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 862 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 863 Function *OutlinedFunc = FuncBBPair.first; 864 BasicBlock* OutliningCallBB = FuncBBPair.second; 865 // Now compute the cost of the call sequence to the outlined function 866 // 'OutlinedFunction' in BB 'OutliningCallBB': 867 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 868 OutliningFuncCallCost += 869 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 870 871 // Now compute the cost of the extracted/outlined function itself: 872 for (BasicBlock &BB : *OutlinedFunc) 873 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 874 } 875 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 876 "Outlined function cost should be no less than the outlined region"); 877 878 // The code extractor introduces a new root and exit stub blocks with 879 // additional unconditional branches. Those branches will be eliminated 880 // later with bb layout. The cost should be adjusted accordingly: 881 OutlinedFunctionCost -= 882 2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size(); 883 884 InstructionCost OutliningRuntimeOverhead = 885 OutliningFuncCallCost + 886 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 887 ExtraOutliningPenalty.getValue(); 888 889 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 890 } 891 892 // Create the callsite to profile count map which is 893 // used to update the original function's entry count, 894 // after the function is partially inlined into the callsite. 
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
  // Snapshot the user list up front so the iteration is stable.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  // Obtain a BlockFrequencyInfo for Caller, building one locally when no
  // GetBFI callback is available.
  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(GetBFI(*Caller));
    }
  };

  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);
    Function *Caller = CB->getCaller();
    // Recompute BFI only when the caller differs from the previous user's.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CB->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    // Call sites without profile data are mapped to a zero count.
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

// Single-region cloner: clone F and remap the FunctionOutliningInfo block
// pointers into the clone via the value map.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOI = std::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Translate every recorded block of the original function to its clone.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries)
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));

  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Multi-region cloner: same idea as above but remaps every outline-candidate
// region recorded in the FunctionOutliningMultiRegionInfo.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region)
      Region.push_back(cast<BasicBlock>(VMap[BB]));

    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    // ReturnBlock is optional for a region; remap it only when present.
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
  // Returns the first PHI of BB, or null if BB does not begin with a PHI.
  // (The loop always exits on its first iteration, via one of the breaks.)
  auto GetFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = GetFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // A PHI whose incoming values are all identical can be replaced by that
  // single value.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    if (llvm::all_equal(PN->incoming_values()))
      return PN->getIncomingValue(0);
    return nullptr;
  };

  // Split after the PHIs: PreReturn keeps the old PHIs, the new ReturnBlock
  // receives second-level PHIs merging PreReturn with the entry predecessors.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHIIt());
  BasicBlock::iterator I = PreReturn->begin();
  BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
    RetPhi->insertBefore(Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();

    // The new PHI takes the old PHI's value along the PreReturn edge and the
    // original incoming values along each entry-predecessor edge, which are
    // simultaneously removed from the old PHI.
    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing unnecessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  // Erase outside the walk above so iterators stay valid.
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Redirect the entry predecessors' terminators to the new ReturnBlock.
  for (auto *E : ClonedOI->ReturnBlockPreds)
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
}

bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  // Sum of the inline cost of every block in the region.
  auto ComputeRegionCost =
      [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
    InstructionCost Cost = 0;
    for (BasicBlock* BB : Region)
      Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);

  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    // Record the region's cost before extraction mutates it.
    InstructionCost CurrentOutlinedRegionCost =
        ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI,
                     LookupAC(*RegionInfo.EntryBlock->getParent()),
                     /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

    LLVM_DEBUG({
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    });

    // Do not extract regions that have live exit variables.
    // (Overridable with -pi-force-live-exit-outline.)
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
      CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      // Optionally mark the outlined (cold) function and its call site with
      // the cold calling convention.
      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS->setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           llvm::is_contained(ClonedOI->Entries, BB);
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
      ClonedOI->NonReturnBlock, ClonedFuncTTI);
  for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
    if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
    }

  // Extract the body of the if.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion(CEAC);

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

// Try to partially inline F. Returns {whether any call site was partially
// inlined, the outlined function created for the single-region case (callers
// re-queue it for another round), or null}.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
  // Address-taken functions cannot have all their uses rewritten safely.
  if (F.hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F.hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F.hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI.isFunctionEntryCold(&F))
    return {false, nullptr};

  if (F.users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(&F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F.hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);

      LLVM_DEBUG({
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      });

      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
        LLVM_DEBUG({
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        });

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
  Cloner.normalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  if (tryPartialInline(Cloner))
    return {true, OutlinedFunction};

  return {false, nullptr};
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  auto OutliningCosts = computeOutliningCosts(Cloner);

  InstructionCost SizeCost = std::get<0>(OutliningCosts);
  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);

  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
         "Expected valid costs");

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  BlockFrequency WeightedRcost =
      BlockFrequency(NonWeightedRcost.getValue()) * RelativeToEntryFreq;

  // If the call sequence(s) to the outlined function(s) are larger than the
  // sum of the original outlined region size(s), it does not increase the
  // chances of inlining the function with outlining (The inliner uses the size
  // increase to model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the clone's users: inlining below erases the call sites.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV) {
      if (auto It = CallSiteToProfCountMap.find(User);
          It != CallSiteToProfCountMap.end()) {
        uint64_t CallSiteCount = It->second;
        // Saturating subtraction: the residual entry count never goes
        // negative.
        CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
      }
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    // Write the residual entry count back to the original function.
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
          CalleeEntryCountV, CalleeEntryCount->getType()));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}

// Driver: attempt partial inlining on every defined, used function in M.
// Newly created outlined functions are pushed back onto the worklist so they
// get a chance to be partially inlined as well.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    // Uses may have disappeared since the function was queued.
    if (CurrFunc->use_empty())
      continue;

    std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

// New pass manager entry point: wire the function-level analyses into
// PartialInlinerImpl callbacks and run over the module.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  // Cached-only lookup: must not trigger analysis computation.
  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };

  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
                         GetTLI, PSI, GetBFI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}