1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/DepthFirstIterator.h" 18 #include "llvm/ADT/STLExtras.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/BranchProbabilityInfo.h" 23 #include "llvm/Analysis/InlineCost.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/ProfileSummaryInfo.h" 27 #include "llvm/Analysis/TargetLibraryInfo.h" 28 #include "llvm/Analysis/TargetTransformInfo.h" 29 #include "llvm/IR/Attributes.h" 30 #include "llvm/IR/BasicBlock.h" 31 #include "llvm/IR/CFG.h" 32 #include "llvm/IR/DebugLoc.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Dominators.h" 35 #include "llvm/IR/Function.h" 36 #include "llvm/IR/InstrTypes.h" 37 #include "llvm/IR/Instruction.h" 38 #include "llvm/IR/Instructions.h" 39 #include "llvm/IR/IntrinsicInst.h" 40 #include "llvm/IR/Intrinsics.h" 41 #include "llvm/IR/Module.h" 42 #include "llvm/IR/Operator.h" 43 #include "llvm/IR/ProfDataUtils.h" 44 #include "llvm/IR/User.h" 45 #include "llvm/Support/BlockFrequency.h" 46 #include "llvm/Support/BranchProbability.h" 47 #include "llvm/Support/Casting.h" 48 #include "llvm/Support/CommandLine.h" 49 #include "llvm/Support/ErrorHandling.h" 50 #include "llvm/Transforms/IPO.h" 51 #include "llvm/Transforms/Utils/Cloning.h" 52 #include "llvm/Transforms/Utils/CodeExtractor.h" 53 #include "llvm/Transforms/Utils/ValueMapper.h" 54 #include <algorithm> 55 #include <cassert> 56 #include <cstdint> 57 #include <memory> 58 #include <tuple> 59 #include <vector> 60 61 using namespace llvm; 62 63 #define DEBUG_TYPE "partial-inlining" 64 65 STATISTIC(NumPartialInlined, 66 "Number of callsites functions partially inlined into."); 67 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " 68 "cold outlined regions were partially " 69 "inlined into its caller(s)."); 70 STATISTIC(NumColdRegionsFound, 71 "Number of cold single entry/exit regions found."); 72 STATISTIC(NumColdRegionsOutlined, 73 "Number of cold single entry/exit regions outlined."); 74 75 // Command line option to disable partial-inlining. The default is false: 76 static cl::opt<bool> 77 DisablePartialInlining("disable-partial-inlining", cl::init(false), 78 cl::Hidden, cl::desc("Disable partial inlining")); 79 // Command line option to disable multi-region partial-inlining. The default is 80 // false: 81 static cl::opt<bool> DisableMultiRegionPartialInline( 82 "disable-mr-partial-inlining", cl::init(false), cl::Hidden, 83 cl::desc("Disable multi-region partial inlining")); 84 85 // Command line option to force outlining in regions with live exit variables. 86 // The default is false: 87 static cl::opt<bool> 88 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, 89 cl::desc("Force outline regions with live exits")); 90 91 // Command line option to enable marking outline functions with Cold Calling 92 // Convention. The default is false: 93 static cl::opt<bool> 94 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, 95 cl::desc("Mark outline function calls with ColdCC")); 96 97 // This is an option used by testing: 98 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 99 100 cl::ReallyHidden, 101 cl::desc("Skip Cost Analysis")); 102 // Used to determine if a cold region is worth outlining based on 103 // its inlining cost compared to the original function. Default is set at 10%. 104 // ie. if the cold region reduces the inlining cost of the original function by 105 // at least 10%. 106 static cl::opt<float> MinRegionSizeRatio( 107 "min-region-size-ratio", cl::init(0.1), cl::Hidden, 108 cl::desc("Minimum ratio comparing relative sizes of each " 109 "outline candidate and original function")); 110 // Used to tune the minimum number of execution counts needed in the predecessor 111 // block to the cold edge. ie. confidence interval. 112 static cl::opt<unsigned> 113 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, 114 cl::desc("Minimum block executions to consider " 115 "its BranchProbabilityInfo valid")); 116 // Used to determine when an edge is considered cold. Default is set to 10%. ie. 117 // if the branch probability is 10% or less, then it is deemed as 'cold'. 118 static cl::opt<float> ColdBranchRatio( 119 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 120 cl::desc("Minimum BranchProbability to consider a region cold.")); 121 122 static cl::opt<unsigned> MaxNumInlineBlocks( 123 "max-num-inline-blocks", cl::init(5), cl::Hidden, 124 cl::desc("Max number of blocks to be partially inlined")); 125 126 // Command line option to set the maximum number of partial inlining allowed 127 // for the module. The default value of -1 means no limit. 128 static cl::opt<int> MaxNumPartialInlining( 129 "max-partial-inlining", cl::init(-1), cl::Hidden, 130 cl::desc("Max number of partial inlining. The default is unlimited")); 131 132 // Used only when PGO or user annotated branch data is absent. It is 133 // the least value that is used to weigh the outline region. If BFI 134 // produces larger value, the BFI value will be used. 135 static cl::opt<int> 136 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 137 cl::Hidden, 138 cl::desc("Relative frequency of outline region to " 139 "the entry block")); 140 141 static cl::opt<unsigned> ExtraOutliningPenalty( 142 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 143 cl::desc("A debug option to add additional penalty to the computed one.")); 144 145 namespace { 146 147 struct FunctionOutliningInfo { 148 FunctionOutliningInfo() = default; 149 150 // Returns the number of blocks to be inlined including all blocks 151 // in Entries and one return block. 152 unsigned getNumInlinedBlocks() const { return Entries.size() + 1; } 153 154 // A set of blocks including the function entry that guard 155 // the region to be outlined. 156 SmallVector<BasicBlock *, 4> Entries; 157 158 // The return block that is not included in the outlined region. 159 BasicBlock *ReturnBlock = nullptr; 160 161 // The dominating block of the region to be outlined. 162 BasicBlock *NonReturnBlock = nullptr; 163 164 // The set of blocks in Entries that are predecessors to ReturnBlock 165 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 166 }; 167 168 struct FunctionOutliningMultiRegionInfo { 169 FunctionOutliningMultiRegionInfo() = default; 170 171 // Container for outline regions 172 struct OutlineRegionInfo { 173 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 174 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 175 BasicBlock *ReturnBlock) 176 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 177 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 178 SmallVector<BasicBlock *, 8> Region; 179 BasicBlock *EntryBlock; 180 BasicBlock *ExitBlock; 181 BasicBlock *ReturnBlock; 182 }; 183 184 SmallVector<OutlineRegionInfo, 4> ORI; 185 }; 186 187 struct PartialInlinerImpl { 188 189 PartialInlinerImpl( 190 function_ref<AssumptionCache &(Function &)> GetAC, 191 function_ref<AssumptionCache *(Function &)> LookupAC, 192 function_ref<TargetTransformInfo &(Function &)> GTTI, 193 function_ref<const TargetLibraryInfo &(Function &)> GTLI, 194 ProfileSummaryInfo &ProfSI, 195 function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr) 196 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 197 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {} 198 199 bool run(Module &M); 200 // Main part of the transformation that calls helper functions to find 201 // outlining candidates, clone & outline the function, and attempt to 202 // partially inline the resulting function. Returns true if 203 // inlining was successful, false otherwise. Also returns the outline 204 // function (only if we partially inlined early returns) as there is a 205 // possibility to further "peel" early return statements that were left in the 206 // outline function due to code size. 207 std::pair<bool, Function *> unswitchFunction(Function &F); 208 209 // This class speculatively clones the function to be partial inlined. 210 // At the end of partial inlining, the remaining callsites to the cloned 211 // function that are not partially inlined will be fixed up to reference 212 // the original function, and the cloned function will be erased. 213 struct FunctionCloner { 214 // Two constructors, one for single region outlining, the other for 215 // multi-region outlining. 216 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 217 OptimizationRemarkEmitter &ORE, 218 function_ref<AssumptionCache *(Function &)> LookupAC, 219 function_ref<TargetTransformInfo &(Function &)> GetTTI); 220 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 221 OptimizationRemarkEmitter &ORE, 222 function_ref<AssumptionCache *(Function &)> LookupAC, 223 function_ref<TargetTransformInfo &(Function &)> GetTTI); 224 225 ~FunctionCloner(); 226 227 // Prepare for function outlining: making sure there is only 228 // one incoming edge from the extracted/outlined region to 229 // the return block. 230 void normalizeReturnBlock() const; 231 232 // Do function outlining for cold regions. 233 bool doMultiRegionFunctionOutlining(); 234 // Do function outlining for region after early return block(s). 235 // NOTE: For vararg functions that do the vararg handling in the outlined 236 // function, we temporarily generate IR that does not properly 237 // forward varargs to the outlined function. Calling InlineFunction 238 // will update calls to the outlined functions to properly forward 239 // the varargs. 240 Function *doSingleRegionFunctionOutlining(); 241 242 Function *OrigFunc = nullptr; 243 Function *ClonedFunc = nullptr; 244 245 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 246 // Keep track of Outlined Functions and the basic block they're called from. 247 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 248 249 // ClonedFunc is inlined in one of its callers after function 250 // outlining. 251 bool IsFunctionInlined = false; 252 // The cost of the region to be outlined. 253 InstructionCost OutlinedRegionCost = 0; 254 // ClonedOI is specific to outlining non-early return blocks. 255 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 256 // ClonedOMRI is specific to outlining cold regions. 257 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 258 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 259 OptimizationRemarkEmitter &ORE; 260 function_ref<AssumptionCache *(Function &)> LookupAC; 261 function_ref<TargetTransformInfo &(Function &)> GetTTI; 262 }; 263 264 private: 265 int NumPartialInlining = 0; 266 function_ref<AssumptionCache &(Function &)> GetAssumptionCache; 267 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 268 function_ref<TargetTransformInfo &(Function &)> GetTTI; 269 function_ref<BlockFrequencyInfo &(Function &)> GetBFI; 270 function_ref<const TargetLibraryInfo &(Function &)> GetTLI; 271 ProfileSummaryInfo &PSI; 272 273 // Return the frequency of the OutlininingBB relative to F's entry point. 274 // The result is no larger than 1 and is represented using BP. 275 // (Note that the outlined region's 'head' block can only have incoming 276 // edges from the guarding entry blocks). 277 BranchProbability 278 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const; 279 280 // Return true if the callee of CB should be partially inlined with 281 // profit. 282 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner, 283 BlockFrequency WeightedOutliningRcost, 284 OptimizationRemarkEmitter &ORE) const; 285 286 // Try to inline DuplicateFunction (cloned from F with call to 287 // the OutlinedFunction into its callers. Return true 288 // if there is any successful inlining. 289 bool tryPartialInline(FunctionCloner &Cloner); 290 291 // Compute the mapping from use site of DuplicationFunction to the enclosing 292 // BB's profile count. 293 void 294 computeCallsiteToProfCountMap(Function *DuplicateFunction, 295 DenseMap<User *, uint64_t> &SiteCountMap) const; 296 297 bool isLimitReached() const { 298 return (MaxNumPartialInlining != -1 && 299 NumPartialInlining >= MaxNumPartialInlining); 300 } 301 302 static CallBase *getSupportedCallBase(User *U) { 303 if (isa<CallInst>(U) || isa<InvokeInst>(U)) 304 return cast<CallBase>(U); 305 llvm_unreachable("All uses must be calls"); 306 return nullptr; 307 } 308 309 static CallBase *getOneCallSiteTo(Function &F) { 310 User *User = *F.user_begin(); 311 return getSupportedCallBase(User); 312 } 313 314 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const { 315 CallBase *CB = getOneCallSiteTo(F); 316 DebugLoc DLoc = CB->getDebugLoc(); 317 BasicBlock *Block = CB->getParent(); 318 return std::make_tuple(DLoc, Block); 319 } 320 321 // Returns the costs associated with function outlining: 322 // - The first value is the non-weighted runtime cost for making the call 323 // to the outlined function, including the addtional setup cost in the 324 // outlined function itself; 325 // - The second value is the estimated size of the new call sequence in 326 // basic block Cloner.OutliningCallBB; 327 std::tuple<InstructionCost, InstructionCost> 328 computeOutliningCosts(FunctionCloner &Cloner) const; 329 330 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 331 // approximate both the size and runtime cost (Note that in the current 332 // inline cost analysis, there is no clear distinction there either). 333 static InstructionCost computeBBInlineCost(BasicBlock *BB, 334 TargetTransformInfo *TTI); 335 336 std::unique_ptr<FunctionOutliningInfo> 337 computeOutliningInfo(Function &F) const; 338 339 std::unique_ptr<FunctionOutliningMultiRegionInfo> 340 computeOutliningColdRegionsInfo(Function &F, 341 OptimizationRemarkEmitter &ORE) const; 342 }; 343 344 } // end anonymous namespace 345 346 std::unique_ptr<FunctionOutliningMultiRegionInfo> 347 PartialInlinerImpl::computeOutliningColdRegionsInfo( 348 Function &F, OptimizationRemarkEmitter &ORE) const { 349 BasicBlock *EntryBlock = &F.front(); 350 351 DominatorTree DT(F); 352 LoopInfo LI(DT); 353 BranchProbabilityInfo BPI(F, LI); 354 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 355 BlockFrequencyInfo *BFI; 356 if (!GetBFI) { 357 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); 358 BFI = ScopedBFI.get(); 359 } else 360 BFI = &(GetBFI(F)); 361 362 // Return if we don't have profiling information. 363 if (!PSI.hasInstrumentationProfile()) 364 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 365 366 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 367 std::make_unique<FunctionOutliningMultiRegionInfo>(); 368 369 auto IsSingleExit = 370 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 371 BasicBlock *ExitBlock = nullptr; 372 for (auto *Block : BlockList) { 373 for (BasicBlock *Succ : successors(Block)) { 374 if (!is_contained(BlockList, Succ)) { 375 if (ExitBlock) { 376 ORE.emit([&]() { 377 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 378 &Succ->front()) 379 << "Region dominated by " 380 << ore::NV("Block", BlockList.front()->getName()) 381 << " has more than one region exit edge."; 382 }); 383 return nullptr; 384 } 385 386 ExitBlock = Block; 387 } 388 } 389 } 390 return ExitBlock; 391 }; 392 393 auto BBProfileCount = [BFI](BasicBlock *BB) { 394 return BFI->getBlockProfileCount(BB).value_or(0); 395 }; 396 397 // Use the same computeBBInlineCost function to compute the cost savings of 398 // the outlining the candidate region. 399 TargetTransformInfo *FTTI = &GetTTI(F); 400 InstructionCost OverallFunctionCost = 0; 401 for (auto &BB : F) 402 OverallFunctionCost += computeBBInlineCost(&BB, FTTI); 403 404 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost 405 << "\n";); 406 407 InstructionCost MinOutlineRegionCost = OverallFunctionCost.map( 408 [&](auto Cost) { return Cost * MinRegionSizeRatio; }); 409 410 BranchProbability MinBranchProbability( 411 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 412 MinBlockCounterExecution); 413 bool ColdCandidateFound = false; 414 BasicBlock *CurrEntry = EntryBlock; 415 std::vector<BasicBlock *> DFS; 416 DenseMap<BasicBlock *, bool> VisitedMap; 417 DFS.push_back(CurrEntry); 418 VisitedMap[CurrEntry] = true; 419 420 // Use Depth First Search on the basic blocks to find CFG edges that are 421 // considered cold. 422 // Cold regions considered must also have its inline cost compared to the 423 // overall inline cost of the original function. The region is outlined only 424 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 425 // more. 426 while (!DFS.empty()) { 427 auto *ThisBB = DFS.back(); 428 DFS.pop_back(); 429 // Only consider regions with predecessor blocks that are considered 430 // not-cold (default: part of the top 99.99% of all block counters) 431 // AND greater than our minimum block execution count (default: 100). 432 if (PSI.isColdBlock(ThisBB, BFI) || 433 BBProfileCount(ThisBB) < MinBlockCounterExecution) 434 continue; 435 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) { 436 if (VisitedMap[*SI]) 437 continue; 438 VisitedMap[*SI] = true; 439 DFS.push_back(*SI); 440 // If branch isn't cold, we skip to the next one. 441 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI); 442 if (SuccProb > MinBranchProbability) 443 continue; 444 445 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->" 446 << SI->getName() 447 << "\nBranch Probability = " << SuccProb << "\n";); 448 449 SmallVector<BasicBlock *, 8> DominateVector; 450 DT.getDescendants(*SI, DominateVector); 451 assert(!DominateVector.empty() && 452 "SI should be reachable and have at least itself as descendant"); 453 454 // We can only outline single entry regions (for now). 455 if (!DominateVector.front()->hasNPredecessors(1)) { 456 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 457 << " doesn't have a single predecessor in the " 458 "dominator tree\n";); 459 continue; 460 } 461 462 BasicBlock *ExitBlock = nullptr; 463 // We can only outline single exit regions (for now). 464 if (!(ExitBlock = IsSingleExit(DominateVector))) { 465 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 466 << " doesn't have a unique successor\n";); 467 continue; 468 } 469 470 InstructionCost OutlineRegionCost = 0; 471 for (auto *BB : DominateVector) 472 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 473 474 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost 475 << "\n";); 476 477 if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) { 478 ORE.emit([&]() { 479 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 480 &SI->front()) 481 << ore::NV("Callee", &F) 482 << " inline cost-savings smaller than " 483 << ore::NV("Cost", MinOutlineRegionCost); 484 }); 485 486 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than " 487 << MinOutlineRegionCost << "\n";); 488 continue; 489 } 490 491 // For now, ignore blocks that belong to a SISE region that is a 492 // candidate for outlining. In the future, we may want to look 493 // at inner regions because the outer region may have live-exit 494 // variables. 495 for (auto *BB : DominateVector) 496 VisitedMap[BB] = true; 497 498 // ReturnBlock here means the block after the outline call 499 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 500 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 501 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 502 OutliningInfo->ORI.push_back(RegInfo); 503 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: " 504 << DominateVector.front()->getName() << "\n";); 505 ColdCandidateFound = true; 506 NumColdRegionsFound++; 507 } 508 } 509 510 if (ColdCandidateFound) 511 return OutliningInfo; 512 513 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 514 } 515 516 std::unique_ptr<FunctionOutliningInfo> 517 PartialInlinerImpl::computeOutliningInfo(Function &F) const { 518 BasicBlock *EntryBlock = &F.front(); 519 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 520 if (!BR || BR->isUnconditional()) 521 return std::unique_ptr<FunctionOutliningInfo>(); 522 523 // Returns true if Succ is BB's successor 524 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 525 return is_contained(successors(BB), Succ); 526 }; 527 528 auto IsReturnBlock = [](BasicBlock *BB) { 529 Instruction *TI = BB->getTerminator(); 530 return isa<ReturnInst>(TI); 531 }; 532 533 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 534 if (IsReturnBlock(Succ1)) 535 return std::make_tuple(Succ1, Succ2); 536 if (IsReturnBlock(Succ2)) 537 return std::make_tuple(Succ2, Succ1); 538 539 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 540 }; 541 542 // Detect a triangular shape: 543 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 544 if (IsSuccessor(Succ1, Succ2)) 545 return std::make_tuple(Succ1, Succ2); 546 if (IsSuccessor(Succ2, Succ1)) 547 return std::make_tuple(Succ2, Succ1); 548 549 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 550 }; 551 552 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 553 std::make_unique<FunctionOutliningInfo>(); 554 555 BasicBlock *CurrEntry = EntryBlock; 556 bool CandidateFound = false; 557 do { 558 // The number of blocks to be inlined has already reached 559 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 560 // disables partial inlining for the function. 561 if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks) 562 break; 563 564 if (succ_size(CurrEntry) != 2) 565 break; 566 567 BasicBlock *Succ1 = *succ_begin(CurrEntry); 568 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 569 570 BasicBlock *ReturnBlock, *NonReturnBlock; 571 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 572 573 if (ReturnBlock) { 574 OutliningInfo->Entries.push_back(CurrEntry); 575 OutliningInfo->ReturnBlock = ReturnBlock; 576 OutliningInfo->NonReturnBlock = NonReturnBlock; 577 CandidateFound = true; 578 break; 579 } 580 581 BasicBlock *CommSucc, *OtherSucc; 582 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 583 584 if (!CommSucc) 585 break; 586 587 OutliningInfo->Entries.push_back(CurrEntry); 588 CurrEntry = OtherSucc; 589 } while (true); 590 591 if (!CandidateFound) 592 return std::unique_ptr<FunctionOutliningInfo>(); 593 594 // There should not be any successors (not in the entry set) other than 595 // {ReturnBlock, NonReturnBlock} 596 assert(OutliningInfo->Entries[0] == &F.front() && 597 "Function Entry must be the first in Entries vector"); 598 DenseSet<BasicBlock *> Entries; 599 for (BasicBlock *E : OutliningInfo->Entries) 600 Entries.insert(E); 601 602 // Returns true of BB has Predecessor which is not 603 // in Entries set. 604 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 605 for (auto *Pred : predecessors(BB)) { 606 if (!Entries.count(Pred)) 607 return true; 608 } 609 return false; 610 }; 611 auto CheckAndNormalizeCandidate = 612 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 613 for (BasicBlock *E : OutliningInfo->Entries) { 614 for (auto *Succ : successors(E)) { 615 if (Entries.count(Succ)) 616 continue; 617 if (Succ == OutliningInfo->ReturnBlock) 618 OutliningInfo->ReturnBlockPreds.push_back(E); 619 else if (Succ != OutliningInfo->NonReturnBlock) 620 return false; 621 } 622 // There should not be any outside incoming edges either: 623 if (HasNonEntryPred(E)) 624 return false; 625 } 626 return true; 627 }; 628 629 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 630 return std::unique_ptr<FunctionOutliningInfo>(); 631 632 // Now further growing the candidate's inlining region by 633 // peeling off dominating blocks from the outlining region: 634 while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) { 635 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 636 if (succ_size(Cand) != 2) 637 break; 638 639 if (HasNonEntryPred(Cand)) 640 break; 641 642 BasicBlock *Succ1 = *succ_begin(Cand); 643 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 644 645 BasicBlock *ReturnBlock, *NonReturnBlock; 646 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 647 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 648 break; 649 650 if (NonReturnBlock->getSinglePredecessor() != Cand) 651 break; 652 653 // Now grow and update OutlininigInfo: 654 OutliningInfo->Entries.push_back(Cand); 655 OutliningInfo->NonReturnBlock = NonReturnBlock; 656 OutliningInfo->ReturnBlockPreds.push_back(Cand); 657 Entries.insert(Cand); 658 } 659 660 return OutliningInfo; 661 } 662 663 // Check if there is PGO data or user annotated branch data: 664 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) { 665 if (F.hasProfileData()) 666 return true; 667 // Now check if any of the entry block has MD_prof data: 668 for (auto *E : OI.Entries) { 669 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 670 if (!BR || BR->isUnconditional()) 671 continue; 672 if (hasBranchWeightMD(*BR)) 673 return true; 674 } 675 return false; 676 } 677 678 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( 679 FunctionCloner &Cloner) const { 680 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 681 auto EntryFreq = 682 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 683 auto OutliningCallFreq = 684 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 685 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 686 // we outlined any regions, so we may encounter situations where the 687 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 688 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) 689 OutliningCallFreq = EntryFreq; 690 691 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 692 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 693 694 if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI)) 695 return OutlineRegionRelFreq; 696 697 // When profile data is not available, we need to be conservative in 698 // estimating the overall savings. Static branch prediction can usually 699 // guess the branch direction right (taken/non-taken), but the guessed 700 // branch probability is usually not biased enough. In case when the 701 // outlined region is predicted to be likely, its probability needs 702 // to be made higher (more biased) to not under-estimate the cost of 703 // function outlining. On the other hand, if the outlined region 704 // is predicted to be less likely, the predicted probablity is usually 705 // higher than the actual. For instance, the actual probability of the 706 // less likely target is only 5%, but the guessed probablity can be 707 // 40%. In the latter case, there is no need for further adjustment. 708 // FIXME: add an option for this. 709 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 710 return OutlineRegionRelFreq; 711 712 OutlineRegionRelFreq = std::max( 713 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 714 715 return OutlineRegionRelFreq; 716 } 717 718 bool PartialInlinerImpl::shouldPartialInline( 719 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, 720 OptimizationRemarkEmitter &ORE) const { 721 using namespace ore; 722 723 Function *Callee = CB.getCalledFunction(); 724 assert(Callee == Cloner.ClonedFunc); 725 726 if (SkipCostAnalysis) 727 return isInlineViable(*Callee).isSuccess(); 728 729 Function *Caller = CB.getCaller(); 730 auto &CalleeTTI = GetTTI(*Callee); 731 bool RemarksEnabled = 732 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled( 733 DEBUG_TYPE); 734 InlineCost IC = 735 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache, 736 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr); 737 738 if (IC.isAlways()) { 739 ORE.emit([&]() { 740 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB) 741 << NV("Callee", Cloner.OrigFunc) 742 << " should always be fully inlined, not partially"; 743 }); 744 return false; 745 } 746 747 if (IC.isNever()) { 748 ORE.emit([&]() { 749 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB) 750 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 751 << NV("Caller", Caller) 752 << " because it should never be inlined (cost=never)"; 753 }); 754 return false; 755 } 756 757 if (!IC) { 758 ORE.emit([&]() { 759 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB) 760 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 761 << NV("Caller", Caller) << " because too costly to inline (cost=" 762 << NV("Cost", IC.getCost()) << ", threshold=" 763 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 764 }); 765 return false; 766 } 767 const DataLayout &DL = Caller->getParent()->getDataLayout(); 768 769 // The savings of eliminating the call: 770 int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL); 771 BlockFrequency NormWeightedSavings(NonWeightedSavings); 772 773 // Weighted saving is smaller than weighted cost, return false 774 if (NormWeightedSavings < WeightedOutliningRcost) { 775 ORE.emit([&]() { 776 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 777 &CB) 778 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 779 << NV("Caller", Caller) << " runtime overhead (overhead=" 780 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 781 << ", savings=" 782 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 783 << ")" 784 << " of making the outlined call is too high"; 785 }); 786 787 return false; 788 } 789 790 ORE.emit([&]() { 791 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB) 792 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 793 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 794 << " (threshold=" 795 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 796 }); 797 return true; 798 } 799 800 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 801 // For now use a simplified version. The returned 'InlineCost' will be used 802 // to esimate the size cost as well as runtime cost of the BB. 803 InstructionCost 804 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, 805 TargetTransformInfo *TTI) { 806 InstructionCost InlineCost = 0; 807 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 808 int InstrCost = InlineConstants::getInstrCost(); 809 for (Instruction &I : BB->instructionsWithoutDebug()) { 810 // Skip free instructions. 811 switch (I.getOpcode()) { 812 case Instruction::BitCast: 813 case Instruction::PtrToInt: 814 case Instruction::IntToPtr: 815 case Instruction::Alloca: 816 case Instruction::PHI: 817 continue; 818 case Instruction::GetElementPtr: 819 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices()) 820 continue; 821 break; 822 default: 823 break; 824 } 825 826 if (I.isLifetimeStartOrEnd()) 827 continue; 828 829 if (auto *II = dyn_cast<IntrinsicInst>(&I)) { 830 Intrinsic::ID IID = II->getIntrinsicID(); 831 SmallVector<Type *, 4> Tys; 832 FastMathFlags FMF; 833 for (Value *Val : II->args()) 834 Tys.push_back(Val->getType()); 835 836 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 837 FMF = FPMO->getFastMathFlags(); 838 839 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); 840 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); 841 continue; 842 } 843 844 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 845 InlineCost += getCallsiteCost(*TTI, *CI, DL); 846 continue; 847 } 848 849 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { 850 InlineCost += getCallsiteCost(*TTI, *II, DL); 851 continue; 852 } 853 854 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) { 855 InlineCost += (SI->getNumCases() + 1) * InstrCost; 856 continue; 857 } 858 InlineCost += InstrCost; 859 } 860 861 return InlineCost; 862 } 863 864 std::tuple<InstructionCost, InstructionCost> 865 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { 866 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 867 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 868 Function *OutlinedFunc = FuncBBPair.first; 869 BasicBlock* OutliningCallBB = FuncBBPair.second; 870 // Now compute the cost of the call sequence to the outlined function 871 // 'OutlinedFunction' in BB 'OutliningCallBB': 872 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 873 OutliningFuncCallCost += 874 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 875 876 // Now compute the cost of the extracted/outlined function itself: 877 for (BasicBlock &BB : *OutlinedFunc) 878 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 879 } 880 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 881 "Outlined function cost should be no less than the outlined region"); 882 883 // The code extractor introduces a new root and exit stub blocks with 884 // additional unconditional branches. Those branches will be eliminated 885 // later with bb layout. The cost should be adjusted accordingly: 886 OutlinedFunctionCost -= 887 2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size(); 888 889 InstructionCost OutliningRuntimeOverhead = 890 OutliningFuncCallCost + 891 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 892 ExtraOutliningPenalty.getValue(); 893 894 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 895 } 896 897 // Create the callsite to profile count map which is 898 // used to update the original function's entry count, 899 // after the function is partially inlined into the callsite. 900 void PartialInlinerImpl::computeCallsiteToProfCountMap( 901 Function *DuplicateFunction, 902 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const { 903 std::vector<User *> Users(DuplicateFunction->user_begin(), 904 DuplicateFunction->user_end()); 905 Function *CurrentCaller = nullptr; 906 std::unique_ptr<BlockFrequencyInfo> TempBFI; 907 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 908 909 auto ComputeCurrBFI = [&,this](Function *Caller) { 910 // For the old pass manager: 911 if (!GetBFI) { 912 DominatorTree DT(*Caller); 913 LoopInfo LI(DT); 914 BranchProbabilityInfo BPI(*Caller, LI); 915 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 916 CurrentCallerBFI = TempBFI.get(); 917 } else { 918 // New pass manager: 919 CurrentCallerBFI = &(GetBFI(*Caller)); 920 } 921 }; 922 923 for (User *User : Users) { 924 // Don't bother with BlockAddress used by CallBr for asm goto. 925 if (isa<BlockAddress>(User)) 926 continue; 927 CallBase *CB = getSupportedCallBase(User); 928 Function *Caller = CB->getCaller(); 929 if (CurrentCaller != Caller) { 930 CurrentCaller = Caller; 931 ComputeCurrBFI(Caller); 932 } else { 933 assert(CurrentCallerBFI && "CallerBFI is not set"); 934 } 935 BasicBlock *CallBB = CB->getParent(); 936 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 937 if (Count) 938 CallSiteToProfCountMap[User] = *Count; 939 else 940 CallSiteToProfCountMap[User] = 0; 941 } 942 } 943 944 PartialInlinerImpl::FunctionCloner::FunctionCloner( 945 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, 946 function_ref<AssumptionCache *(Function &)> LookupAC, 947 function_ref<TargetTransformInfo &(Function &)> GetTTI) 948 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 949 ClonedOI = std::make_unique<FunctionOutliningInfo>(); 950 951 // Clone the function, so that we can hack away on it. 952 ValueToValueMapTy VMap; 953 ClonedFunc = CloneFunction(F, VMap); 954 955 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 956 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 957 for (BasicBlock *BB : OI->Entries) 958 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 959 960 for (BasicBlock *E : OI->ReturnBlockPreds) { 961 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 962 ClonedOI->ReturnBlockPreds.push_back(NewE); 963 } 964 // Go ahead and update all uses to the duplicate, so that we can just 965 // use the inliner functionality when we're done hacking. 966 F->replaceAllUsesWith(ClonedFunc); 967 } 968 969 PartialInlinerImpl::FunctionCloner::FunctionCloner( 970 Function *F, FunctionOutliningMultiRegionInfo *OI, 971 OptimizationRemarkEmitter &ORE, 972 function_ref<AssumptionCache *(Function &)> LookupAC, 973 function_ref<TargetTransformInfo &(Function &)> GetTTI) 974 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 975 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); 976 977 // Clone the function, so that we can hack away on it. 978 ValueToValueMapTy VMap; 979 ClonedFunc = CloneFunction(F, VMap); 980 981 // Go through all Outline Candidate Regions and update all BasicBlock 982 // information. 983 for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo : 984 OI->ORI) { 985 SmallVector<BasicBlock *, 8> Region; 986 for (BasicBlock *BB : RegionInfo.Region) 987 Region.push_back(cast<BasicBlock>(VMap[BB])); 988 989 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 990 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 991 BasicBlock *NewReturnBlock = nullptr; 992 if (RegionInfo.ReturnBlock) 993 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 994 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 995 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 996 ClonedOMRI->ORI.push_back(MappedRegionInfo); 997 } 998 // Go ahead and update all uses to the duplicate, so that we can just 999 // use the inliner functionality when we're done hacking. 1000 F->replaceAllUsesWith(ClonedFunc); 1001 } 1002 1003 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { 1004 auto GetFirstPHI = [](BasicBlock *BB) { 1005 BasicBlock::iterator I = BB->begin(); 1006 PHINode *FirstPhi = nullptr; 1007 while (I != BB->end()) { 1008 PHINode *Phi = dyn_cast<PHINode>(I); 1009 if (!Phi) 1010 break; 1011 if (!FirstPhi) { 1012 FirstPhi = Phi; 1013 break; 1014 } 1015 } 1016 return FirstPhi; 1017 }; 1018 1019 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1020 // blocks. 1021 if (!ClonedOI) 1022 return; 1023 1024 // Special hackery is needed with PHI nodes that have inputs from more than 1025 // one extracted block. For simplicity, just split the PHIs into a two-level 1026 // sequence of PHIs, some of which will go in the extracted region, and some 1027 // of which will go outside. 1028 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1029 // only split block when necessary: 1030 PHINode *FirstPhi = GetFirstPHI(PreReturn); 1031 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1032 1033 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1034 return; 1035 1036 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1037 if (llvm::all_equal(PN->incoming_values())) 1038 return PN->getIncomingValue(0); 1039 return nullptr; 1040 }; 1041 1042 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1043 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1044 BasicBlock::iterator I = PreReturn->begin(); 1045 BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin(); 1046 SmallVector<Instruction *, 4> DeadPhis; 1047 while (I != PreReturn->end()) { 1048 PHINode *OldPhi = dyn_cast<PHINode>(I); 1049 if (!OldPhi) 1050 break; 1051 1052 PHINode *RetPhi = 1053 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, ""); 1054 RetPhi->insertBefore(Ins); 1055 OldPhi->replaceAllUsesWith(RetPhi); 1056 Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt(); 1057 1058 RetPhi->addIncoming(&*I, PreReturn); 1059 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1060 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1061 OldPhi->removeIncomingValue(E); 1062 } 1063 1064 // After incoming values splitting, the old phi may become trivial. 1065 // Keeping the trivial phi can introduce definition inside the outline 1066 // region which is live-out, causing necessary overhead (load, store 1067 // arg passing etc). 1068 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1069 OldPhi->replaceAllUsesWith(OldPhiVal); 1070 DeadPhis.push_back(OldPhi); 1071 } 1072 ++I; 1073 } 1074 for (auto *DP : DeadPhis) 1075 DP->eraseFromParent(); 1076 1077 for (auto *E : ClonedOI->ReturnBlockPreds) 1078 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1079 } 1080 1081 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1082 1083 auto ComputeRegionCost = 1084 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost { 1085 InstructionCost Cost = 0; 1086 for (BasicBlock* BB : Region) 1087 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 1088 return Cost; 1089 }; 1090 1091 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1092 1093 if (ClonedOMRI->ORI.empty()) 1094 return false; 1095 1096 // The CodeExtractor needs a dominator tree. 1097 DominatorTree DT; 1098 DT.recalculate(*ClonedFunc); 1099 1100 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1101 LoopInfo LI(DT); 1102 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1103 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1104 1105 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 1106 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1107 1108 SetVector<Value *> Inputs, Outputs, Sinks; 1109 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1110 ClonedOMRI->ORI) { 1111 InstructionCost CurrentOutlinedRegionCost = 1112 ComputeRegionCost(RegionInfo.Region); 1113 1114 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1115 ClonedFuncBFI.get(), &BPI, 1116 LookupAC(*RegionInfo.EntryBlock->getParent()), 1117 /* AllowVarargs */ false); 1118 1119 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1120 1121 LLVM_DEBUG({ 1122 dbgs() << "inputs: " << Inputs.size() << "\n"; 1123 dbgs() << "outputs: " << Outputs.size() << "\n"; 1124 for (Value *value : Inputs) 1125 dbgs() << "value used in func: " << *value << "\n"; 1126 for (Value *output : Outputs) 1127 dbgs() << "instr used in func: " << *output << "\n"; 1128 }); 1129 1130 // Do not extract regions that have live exit variables. 1131 if (Outputs.size() > 0 && !ForceLiveExit) 1132 continue; 1133 1134 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) { 1135 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc); 1136 BasicBlock *OutliningCallBB = OCS->getParent(); 1137 assert(OutliningCallBB->getParent() == ClonedFunc); 1138 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1139 NumColdRegionsOutlined++; 1140 OutlinedRegionCost += CurrentOutlinedRegionCost; 1141 1142 if (MarkOutlinedColdCC) { 1143 OutlinedFunc->setCallingConv(CallingConv::Cold); 1144 OCS->setCallingConv(CallingConv::Cold); 1145 } 1146 } else 1147 ORE.emit([&]() { 1148 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1149 &RegionInfo.Region.front()->front()) 1150 << "Failed to extract region at block " 1151 << ore::NV("Block", RegionInfo.Region.front()); 1152 }); 1153 } 1154 1155 return !OutlinedFunctions.empty(); 1156 } 1157 1158 Function * 1159 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1160 // Returns true if the block is to be partial inlined into the caller 1161 // (i.e. not to be extracted to the out of line function) 1162 auto ToBeInlined = [&, this](BasicBlock *BB) { 1163 return BB == ClonedOI->ReturnBlock || 1164 llvm::is_contained(ClonedOI->Entries, BB); 1165 }; 1166 1167 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1168 // The CodeExtractor needs a dominator tree. 1169 DominatorTree DT; 1170 DT.recalculate(*ClonedFunc); 1171 1172 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1173 LoopInfo LI(DT); 1174 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1175 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1176 1177 // Gather up the blocks that we're going to extract. 1178 std::vector<BasicBlock *> ToExtract; 1179 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); 1180 ToExtract.push_back(ClonedOI->NonReturnBlock); 1181 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( 1182 ClonedOI->NonReturnBlock, ClonedFuncTTI); 1183 for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock())) 1184 if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) { 1185 ToExtract.push_back(BB); 1186 // FIXME: the code extractor may hoist/sink more code 1187 // into the outlined function which may make the outlining 1188 // overhead (the difference of the outlined function cost 1189 // and OutliningRegionCost) look larger. 1190 OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI); 1191 } 1192 1193 // Extract the body of the if. 1194 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1195 Function *OutlinedFunc = 1196 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1197 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), 1198 /* AllowVarargs */ true) 1199 .extractCodeRegion(CEAC); 1200 1201 if (OutlinedFunc) { 1202 BasicBlock *OutliningCallBB = 1203 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent(); 1204 assert(OutliningCallBB->getParent() == ClonedFunc); 1205 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1206 } else 1207 ORE.emit([&]() { 1208 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1209 &ToExtract.front()->front()) 1210 << "Failed to extract region at block " 1211 << ore::NV("Block", ToExtract.front()); 1212 }); 1213 1214 return OutlinedFunc; 1215 } 1216 1217 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1218 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1219 // users (function pointers, etc.) back to the original function. 1220 ClonedFunc->replaceAllUsesWith(OrigFunc); 1221 ClonedFunc->eraseFromParent(); 1222 if (!IsFunctionInlined) { 1223 // Remove each function that was speculatively created if there is no 1224 // reference. 1225 for (auto FuncBBPair : OutlinedFunctions) { 1226 Function *Func = FuncBBPair.first; 1227 Func->eraseFromParent(); 1228 } 1229 } 1230 } 1231 1232 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) { 1233 if (F.hasAddressTaken()) 1234 return {false, nullptr}; 1235 1236 // Let inliner handle it 1237 if (F.hasFnAttribute(Attribute::AlwaysInline)) 1238 return {false, nullptr}; 1239 1240 if (F.hasFnAttribute(Attribute::NoInline)) 1241 return {false, nullptr}; 1242 1243 if (PSI.isFunctionEntryCold(&F)) 1244 return {false, nullptr}; 1245 1246 if (F.users().empty()) 1247 return {false, nullptr}; 1248 1249 OptimizationRemarkEmitter ORE(&F); 1250 1251 // Only try to outline cold regions if we have a profile summary, which 1252 // implies we have profiling information. 1253 if (PSI.hasProfileSummary() && F.hasProfileData() && 1254 !DisableMultiRegionPartialInline) { 1255 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1256 computeOutliningColdRegionsInfo(F, ORE); 1257 if (OMRI) { 1258 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); 1259 1260 LLVM_DEBUG({ 1261 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n"; 1262 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold() 1263 << "\n"; 1264 }); 1265 1266 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1267 1268 if (DidOutline) { 1269 LLVM_DEBUG({ 1270 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1271 Cloner.ClonedFunc->print(dbgs()); 1272 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1273 }); 1274 1275 if (tryPartialInline(Cloner)) 1276 return {true, nullptr}; 1277 } 1278 } 1279 } 1280 1281 // Fall-thru to regular partial inlining if we: 1282 // i) can't find any cold regions to outline, or 1283 // ii) can't inline the outlined function anywhere. 1284 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1285 if (!OI) 1286 return {false, nullptr}; 1287 1288 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI); 1289 Cloner.normalizeReturnBlock(); 1290 1291 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1292 1293 if (!OutlinedFunction) 1294 return {false, nullptr}; 1295 1296 if (tryPartialInline(Cloner)) 1297 return {true, OutlinedFunction}; 1298 1299 return {false, nullptr}; 1300 } 1301 1302 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1303 if (Cloner.OutlinedFunctions.empty()) 1304 return false; 1305 1306 auto OutliningCosts = computeOutliningCosts(Cloner); 1307 1308 InstructionCost SizeCost = std::get<0>(OutliningCosts); 1309 InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts); 1310 1311 assert(SizeCost.isValid() && NonWeightedRcost.isValid() && 1312 "Expected valid costs"); 1313 1314 // Only calculate RelativeToEntryFreq when we are doing single region 1315 // outlining. 1316 BranchProbability RelativeToEntryFreq; 1317 if (Cloner.ClonedOI) 1318 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1319 else 1320 // RelativeToEntryFreq doesn't make sense when we have more than one 1321 // outlined call because each call will have a different relative frequency 1322 // to the entry block. We can consider using the average, but the 1323 // usefulness of that information is questionable. For now, assume we never 1324 // execute the calls to outlined functions. 1325 RelativeToEntryFreq = BranchProbability(0, 1); 1326 1327 BlockFrequency WeightedRcost = 1328 BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq; 1329 1330 // The call sequence(s) to the outlined function(s) are larger than the sum of 1331 // the original outlined region size(s), it does not increase the chances of 1332 // inlining the function with outlining (The inliner uses the size increase to 1333 // model the cost of inlining a callee). 1334 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1335 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1336 DebugLoc DLoc; 1337 BasicBlock *Block; 1338 std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc); 1339 OrigFuncORE.emit([&]() { 1340 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1341 DLoc, Block) 1342 << ore::NV("Function", Cloner.OrigFunc) 1343 << " not partially inlined into callers (Original Size = " 1344 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1345 << ", Size of call sequence to outlined function = " 1346 << ore::NV("NewSize", SizeCost) << ")"; 1347 }); 1348 return false; 1349 } 1350 1351 assert(Cloner.OrigFunc->users().empty() && 1352 "F's users should all be replaced!"); 1353 1354 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1355 Cloner.ClonedFunc->user_end()); 1356 1357 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1358 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1359 if (CalleeEntryCount) 1360 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1361 1362 uint64_t CalleeEntryCountV = 1363 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); 1364 1365 bool AnyInline = false; 1366 for (User *User : Users) { 1367 // Don't bother with BlockAddress used by CallBr for asm goto. 1368 if (isa<BlockAddress>(User)) 1369 continue; 1370 1371 CallBase *CB = getSupportedCallBase(User); 1372 1373 if (isLimitReached()) 1374 continue; 1375 1376 OptimizationRemarkEmitter CallerORE(CB->getCaller()); 1377 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE)) 1378 continue; 1379 1380 // Construct remark before doing the inlining, as after successful inlining 1381 // the callsite is removed. 1382 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB); 1383 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1384 << ore::NV("Caller", CB->getCaller()); 1385 1386 InlineFunctionInfo IFI(GetAssumptionCache, &PSI); 1387 // We can only forward varargs when we outlined a single region, else we 1388 // bail on vararg functions. 1389 if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true, 1390 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1391 : nullptr)) 1392 .isSuccess()) 1393 continue; 1394 1395 CallerORE.emit(OR); 1396 1397 // Now update the entry count: 1398 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1399 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1400 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1401 } 1402 1403 AnyInline = true; 1404 NumPartialInlining++; 1405 // Update the stats 1406 if (Cloner.ClonedOI) 1407 NumPartialInlined++; 1408 else 1409 NumColdOutlinePartialInlined++; 1410 } 1411 1412 if (AnyInline) { 1413 Cloner.IsFunctionInlined = true; 1414 if (CalleeEntryCount) 1415 Cloner.OrigFunc->setEntryCount(Function::ProfileCount( 1416 CalleeEntryCountV, CalleeEntryCount->getType())); 1417 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1418 OrigFuncORE.emit([&]() { 1419 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1420 << "Partially inlined into at least one caller"; 1421 }); 1422 } 1423 1424 return AnyInline; 1425 } 1426 1427 bool PartialInlinerImpl::run(Module &M) { 1428 if (DisablePartialInlining) 1429 return false; 1430 1431 std::vector<Function *> Worklist; 1432 Worklist.reserve(M.size()); 1433 for (Function &F : M) 1434 if (!F.use_empty() && !F.isDeclaration()) 1435 Worklist.push_back(&F); 1436 1437 bool Changed = false; 1438 while (!Worklist.empty()) { 1439 Function *CurrFunc = Worklist.back(); 1440 Worklist.pop_back(); 1441 1442 if (CurrFunc->use_empty()) 1443 continue; 1444 1445 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc); 1446 if (Result.second) 1447 Worklist.push_back(Result.second); 1448 Changed |= Result.first; 1449 } 1450 1451 return Changed; 1452 } 1453 1454 PreservedAnalyses PartialInlinerPass::run(Module &M, 1455 ModuleAnalysisManager &AM) { 1456 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1457 1458 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { 1459 return FAM.getResult<AssumptionAnalysis>(F); 1460 }; 1461 1462 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { 1463 return FAM.getCachedResult<AssumptionAnalysis>(F); 1464 }; 1465 1466 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { 1467 return FAM.getResult<BlockFrequencyAnalysis>(F); 1468 }; 1469 1470 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { 1471 return FAM.getResult<TargetIRAnalysis>(F); 1472 }; 1473 1474 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { 1475 return FAM.getResult<TargetLibraryAnalysis>(F); 1476 }; 1477 1478 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M); 1479 1480 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 1481 GetTLI, PSI, GetBFI) 1482 .run(M)) 1483 return PreservedAnalyses::none(); 1484 return PreservedAnalyses::all(); 1485 } 1486