//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  bool ComputeFullInlineCost;

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost = 0;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallBase &Call, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE),
        EnableLoadElimination(true) {}

  InlineResult analyzeCall(CallBase &Call);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;
  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  void dump();
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  addCost(CostIt->second);
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
          AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(),
                                  AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this
      // time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      addCost(InlineConstants::CallPenalty);
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary instructions.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in below scenario inlining hot_call_X() may be
  // beneficial :
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallBase &Call,
                                  BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CallSite(&Call), CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() &&
      PSI->isHotCallSite(CallSite(&Call), CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter setup
    // and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this
  // method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                            CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    addCost(InlineConstants::CallPenalty);

  return false;
}

bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = SimplifyFNegInst(COp ? COp : Op,
                                    cast<FPMathOperator>(I).getFastMathFlags(),
                                    DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = Call.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, Call))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
      switch (II->getIntrinsicID()) {
      default:
        if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallBase(Call);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        addCost(3 * InlineConstants::InstrCost);
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
        InitsVargArgs = true;
        return false;
      }
    }

    if (F == Call.getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      addCost(Call.arg_size() * InlineConstants::InstrCost);

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(Call.getCalledValue()))
        addCost(InlineConstants::CallPenalty);
    }

    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = Call.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  addCost(Call.arg_size() * InlineConstants::InstrCost);

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
                  IndirectCallParams);
  if (CA.analyzeCall(Call)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallBase(Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
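  // For example, a switch that lowers to a 64-entry jump table is modeled
  // below as 64 * InstrCost for the table plus 4 * InstrCost for the bounds
  // check and indirect branch (see JTCost).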
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    addCost(JTCost, (int64_t)CostUpperBound);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is :
  // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf should be :
  // 2^(log2(n) - 1) - 1
  // = 2^log2(n) * 2^-1 - 1
  // = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form :
  // n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  addCost(SwitchCost, (int64_t)CostUpperBound);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
1523 return false;
1524 }
1525
1526 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1527 // FIXME: It's not clear that a single instruction is an accurate model for
1528 // the inline cost of a catchret instruction.
1529 return false;
1530 }
1531
1532 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1533 // FIXME: It might be reasonable to discount the cost of instructions leading
1534 // to unreachable as they have the lowest possible impact on both runtime and
1535 // code size.
1536 return true; // No actual code is needed for unreachable.
1537 }
1538
1539 bool CallAnalyzer::visitInstruction(Instruction &I) {
1540 // Some instructions are free. All of the free intrinsics can also be
1541 // handled by SROA, etc.
1542 if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1543 return true;
1544
1545 // We found something we don't understand or can't handle. Mark any SROA-able
1546 // values in the operand list as no longer viable.
1547 for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1548 disableSROA(*OI);
1549
1550 return false;
1551 }
1552
1553 /// Analyze a basic block for its contribution to the inline cost.
1554 ///
1555 /// This method walks the analyzer over every instruction in the given basic
1556 /// block and accounts for their cost during inlining at this callsite. It
1557 /// aborts early if the threshold has been exceeded or an impossible-to-inline
1558 /// construct has been detected. It returns false if inlining is no longer
1559 /// viable, and true if inlining remains viable.
1560 InlineResult
1561 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1562 SmallPtrSetImpl<const Value *> &EphValues) {
1563 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1564 // FIXME: Currently, the number of instructions in a function, regardless of
1565 // our ability to simplify them to constants or dead code during inlining,
1566 // is actually used by the vector bonus heuristic. As long as that's true,
1567 // we have to special case debug intrinsics here to prevent differences in
1568 // inlining due to debug symbols. Eventually, the number of unsimplified
1569 // instructions shouldn't factor into the cost computation, but until then,
1570 // hack around it here.
1571 if (isa<DbgInfoIntrinsic>(I))
1572 continue;
1573
1574 // Skip ephemeral values.
1575 if (EphValues.count(&*I))
1576 continue;
1577
1578 ++NumInstructions;
1579 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1580 ++NumVectorInstructions;
1581
1582 // If the instruction simplified to a constant, there is no cost to this
1583 // instruction. Visit the instructions using our InstVisitor to account for
1584 // all of the per-instruction logic. The visit tree returns true if we
1585 // consumed the instruction in any way, and false if the instruction's base
1586 // cost should count against inlining.
1587 if (Base::visit(&*I))
1588 ++NumInstructionsSimplified;
1589 else
1590 addCost(InlineConstants::InstrCost);
1591
1592 using namespace ore;
1593 // If visiting this instruction detected an uninlinable pattern, abort.
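// (Note: InlineResult converts to true only while no failure message has been
// recorded, so the "if (!IR)" check below fires exactly when one of the
// uninlinable conditions just below assigns a reason string.)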
1594 InlineResult IR; 1595 if (IsRecursiveCall) 1596 IR = "recursive"; 1597 else if (ExposesReturnsTwice) 1598 IR = "exposes returns twice"; 1599 else if (HasDynamicAlloca) 1600 IR = "dynamic alloca"; 1601 else if (HasIndirectBr) 1602 IR = "indirect branch"; 1603 else if (HasUninlineableIntrinsic) 1604 IR = "uninlinable intrinsic"; 1605 else if (InitsVargArgs) 1606 IR = "varargs"; 1607 if (!IR) { 1608 if (ORE) 1609 ORE->emit([&]() { 1610 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", 1611 &CandidateCall) 1612 << NV("Callee", &F) << " has uninlinable pattern (" 1613 << NV("InlineResult", IR.message) 1614 << ") and cost is not fully computed"; 1615 }); 1616 return IR; 1617 } 1618 1619 // If the caller is a recursive function then we don't want to inline 1620 // functions which allocate a lot of stack space because it would increase 1621 // the caller stack usage dramatically. 1622 if (IsCallerRecursive && 1623 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) { 1624 InlineResult IR = "recursive and allocates too much stack space"; 1625 if (ORE) 1626 ORE->emit([&]() { 1627 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", 1628 &CandidateCall) 1629 << NV("Callee", &F) << " is " << NV("InlineResult", IR.message) 1630 << ". Cost is not fully computed"; 1631 }); 1632 return IR; 1633 } 1634 1635 // Check if we've passed the maximum possible threshold so we don't spin in 1636 // huge basic blocks that will never inline. 1637 if (Cost >= Threshold && !ComputeFullInlineCost) 1638 return false; 1639 } 1640 1641 return true; 1642 } 1643 1644 /// Compute the base pointer and cumulative constant offsets for V. 1645 /// 1646 /// This strips all constant offsets off of V, leaving it the base pointer, and 1647 /// accumulates the total constant offset applied in the returned constant. It 1648 /// returns 0 if V is not a pointer, and returns the constant '0' if there are 1649 /// no constant offsets applied. 1650 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { 1651 if (!V->getType()->isPointerTy()) 1652 return nullptr; 1653 1654 unsigned AS = V->getType()->getPointerAddressSpace(); 1655 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); 1656 APInt Offset = APInt::getNullValue(IntPtrWidth); 1657 1658 // Even though we don't look through PHI nodes, we could be called on an 1659 // instruction in an unreachable block, which may be on a cycle. 1660 SmallPtrSet<Value *, 4> Visited; 1661 Visited.insert(V); 1662 do { 1663 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { 1664 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) 1665 return nullptr; 1666 V = GEP->getPointerOperand(); 1667 } else if (Operator::getOpcode(V) == Instruction::BitCast) { 1668 V = cast<Operator>(V)->getOperand(0); 1669 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { 1670 if (GA->isInterposable()) 1671 break; 1672 V = GA->getAliasee(); 1673 } else { 1674 break; 1675 } 1676 assert(V->getType()->isPointerTy() && "Unexpected operand type!"); 1677 } while (Visited.insert(V).second); 1678 1679 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS); 1680 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset)); 1681 } 1682 1683 /// Find dead blocks due to deleted CFG edges during inlining. 1684 /// 1685 /// If we know the successor of the current block, \p CurrBB, has to be \p 1686 /// NextBB, the other successors of \p CurrBB are dead if these successors have 1687 /// no live incoming CFG edges. 
If one block is found to be dead, we can 1688 /// continue growing the dead block list by checking the successors of the dead 1689 /// blocks to see if all their incoming edges are dead or not. 1690 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { 1691 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { 1692 // A CFG edge is dead if the predecessor is dead or the predecessor has a 1693 // known successor which is not the one under exam. 1694 return (DeadBlocks.count(Pred) || 1695 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); 1696 }; 1697 1698 auto IsNewlyDead = [&](BasicBlock *BB) { 1699 // If all the edges to a block are dead, the block is also dead. 1700 return (!DeadBlocks.count(BB) && 1701 llvm::all_of(predecessors(BB), 1702 [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); 1703 }; 1704 1705 for (BasicBlock *Succ : successors(CurrBB)) { 1706 if (Succ == NextBB || !IsNewlyDead(Succ)) 1707 continue; 1708 SmallVector<BasicBlock *, 4> NewDead; 1709 NewDead.push_back(Succ); 1710 while (!NewDead.empty()) { 1711 BasicBlock *Dead = NewDead.pop_back_val(); 1712 if (DeadBlocks.insert(Dead)) 1713 // Continue growing the dead block lists. 1714 for (BasicBlock *S : successors(Dead)) 1715 if (IsNewlyDead(S)) 1716 NewDead.push_back(S); 1717 } 1718 } 1719 } 1720 1721 /// Analyze a call site for potential inlining. 1722 /// 1723 /// Returns true if inlining this call is viable, and false if it is not 1724 /// viable. It computes the cost and adjusts the threshold based on numerous 1725 /// factors and heuristics. If this method returns false but the computed cost 1726 /// is below the computed threshold, then inlining was forcibly disabled by 1727 /// some artifact of the routine. 1728 InlineResult CallAnalyzer::analyzeCall(CallBase &Call) { 1729 ++NumCallsAnalyzed; 1730 1731 // Perform some tweaks to the cost and threshold based on the direct 1732 // callsite information. 1733 1734 // We want to more aggressively inline vector-dense kernels, so up the 1735 // threshold, and we'll lower it if the % of vector instructions gets too 1736 // low. Note that these bonuses are some what arbitrary and evolved over time 1737 // by accident as much as because they are principled bonuses. 1738 // 1739 // FIXME: It would be nice to remove all such bonuses. At least it would be 1740 // nice to base the bonus values on something more scientific. 1741 assert(NumInstructions == 0); 1742 assert(NumVectorInstructions == 0); 1743 1744 // Update the threshold based on callsite properties 1745 updateThreshold(Call, F); 1746 1747 // While Threshold depends on commandline options that can take negative 1748 // values, we want to enforce the invariant that the computed threshold and 1749 // bonuses are non-negative. 1750 assert(Threshold >= 0); 1751 assert(SingleBBBonus >= 0); 1752 assert(VectorBonus >= 0); 1753 1754 // Speculatively apply all possible bonuses to Threshold. If cost exceeds 1755 // this Threshold any time, and cost cannot decrease, we can stop processing 1756 // the rest of the function body. 1757 Threshold += (SingleBBBonus + VectorBonus); 1758 1759 // Give out bonuses for the callsite, as the instructions setting them up 1760 // will be gone after inlining. 1761 addCost(-getCallsiteCost(Call, DL)); 1762 1763 // If this function uses the coldcc calling convention, prefer not to inline 1764 // it. 1765 if (F.getCallingConv() == CallingConv::Cold) 1766 Cost += InlineConstants::ColdccPenalty; 1767 1768 // Check if we're done. 
This can happen due to bonuses and penalties. 1769 if (Cost >= Threshold && !ComputeFullInlineCost) 1770 return "high cost"; 1771 1772 if (F.empty()) 1773 return true; 1774 1775 Function *Caller = Call.getFunction(); 1776 // Check if the caller function is recursive itself. 1777 for (User *U : Caller->users()) { 1778 CallBase *Call = dyn_cast<CallBase>(U); 1779 if (Call && Call->getFunction() == Caller) { 1780 IsCallerRecursive = true; 1781 break; 1782 } 1783 } 1784 1785 // Populate our simplified values by mapping from function arguments to call 1786 // arguments with known important simplifications. 1787 auto CAI = Call.arg_begin(); 1788 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end(); 1789 FAI != FAE; ++FAI, ++CAI) { 1790 assert(CAI != Call.arg_end()); 1791 if (Constant *C = dyn_cast<Constant>(CAI)) 1792 SimplifiedValues[&*FAI] = C; 1793 1794 Value *PtrArg = *CAI; 1795 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { 1796 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); 1797 1798 // We can SROA any pointer arguments derived from alloca instructions. 1799 if (isa<AllocaInst>(PtrArg)) { 1800 SROAArgValues[&*FAI] = PtrArg; 1801 SROAArgCosts[PtrArg] = 0; 1802 } 1803 } 1804 } 1805 NumConstantArgs = SimplifiedValues.size(); 1806 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); 1807 NumAllocaArgs = SROAArgValues.size(); 1808 1809 // FIXME: If a caller has multiple calls to a callee, we end up recomputing 1810 // the ephemeral values multiple times (and they're completely determined by 1811 // the callee, so this is purely duplicate work). 1812 SmallPtrSet<const Value *, 32> EphValues; 1813 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues); 1814 1815 // The worklist of live basic blocks in the callee *after* inlining. We avoid 1816 // adding basic blocks of the callee which can be proven to be dead for this 1817 // particular call site in order to get more accurate cost estimates. This 1818 // requires a somewhat heavyweight iteration pattern: we need to walk the 1819 // basic blocks in a breadth-first order as we insert live successors. To 1820 // accomplish this, prioritizing for small iterations because we exit after 1821 // crossing our threshold, we use a small-size optimized SetVector. 1822 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, 1823 SmallPtrSet<BasicBlock *, 16>> 1824 BBSetVector; 1825 BBSetVector BBWorklist; 1826 BBWorklist.insert(&F.getEntryBlock()); 1827 bool SingleBB = true; 1828 // Note that we *must not* cache the size, this loop grows the worklist. 1829 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { 1830 // Bail out the moment we cross the threshold. This means we'll under-count 1831 // the cost, but only when undercounting doesn't matter. 1832 if (Cost >= Threshold && !ComputeFullInlineCost) 1833 break; 1834 1835 BasicBlock *BB = BBWorklist[Idx]; 1836 if (BB->empty()) 1837 continue; 1838 1839 // Disallow inlining a blockaddress with uses other than strictly callbr. 1840 // A blockaddress only has defined behavior for an indirect branch in the 1841 // same function, and we do not currently support inlining indirect 1842 // branches. But, the inliner may not see an indirect branch that ends up 1843 // being dead code at a particular call site. If the blockaddress escapes 1844 // the function, e.g., via a global variable, inlining may lead to an 1845 // invalid cross-function reference. 1846 // FIXME: pr/39560: continue relaxing this overt restriction. 
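// For example (illustrative only): if the address of a block escapes through
// a global initializer such as
//   @tbl = global i8* blockaddress(@callee, %bb)
// the constant would still refer to the original function after inlining, so
// such callees are rejected below rather than miscompiled.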
1847 if (BB->hasAddressTaken()) 1848 for (User *U : BlockAddress::get(&*BB)->users()) 1849 if (!isa<CallBrInst>(*U)) 1850 return "blockaddress used outside of callbr"; 1851 1852 // Analyze the cost of this block. If we blow through the threshold, this 1853 // returns false, and we can bail on out. 1854 InlineResult IR = analyzeBlock(BB, EphValues); 1855 if (!IR) 1856 return IR; 1857 1858 Instruction *TI = BB->getTerminator(); 1859 1860 // Add in the live successors by first checking whether we have terminator 1861 // that may be simplified based on the values simplified by this call. 1862 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { 1863 if (BI->isConditional()) { 1864 Value *Cond = BI->getCondition(); 1865 if (ConstantInt *SimpleCond = 1866 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 1867 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0); 1868 BBWorklist.insert(NextBB); 1869 KnownSuccessors[BB] = NextBB; 1870 findDeadBlocks(BB, NextBB); 1871 continue; 1872 } 1873 } 1874 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { 1875 Value *Cond = SI->getCondition(); 1876 if (ConstantInt *SimpleCond = 1877 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 1878 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor(); 1879 BBWorklist.insert(NextBB); 1880 KnownSuccessors[BB] = NextBB; 1881 findDeadBlocks(BB, NextBB); 1882 continue; 1883 } 1884 } 1885 1886 // If we're unable to select a particular successor, just count all of 1887 // them. 1888 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; 1889 ++TIdx) 1890 BBWorklist.insert(TI->getSuccessor(TIdx)); 1891 1892 // If we had any successors at this point, than post-inlining is likely to 1893 // have them as well. Note that we assume any basic blocks which existed 1894 // due to branches or switches which folded above will also fold after 1895 // inlining. 1896 if (SingleBB && TI->getNumSuccessors() > 1) { 1897 // Take off the bonus we applied to the threshold. 1898 Threshold -= SingleBBBonus; 1899 SingleBB = false; 1900 } 1901 } 1902 1903 bool OnlyOneCallAndLocalLinkage = 1904 F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction(); 1905 // If this is a noduplicate call, we can still inline as long as 1906 // inlining this would cause the removal of the caller (so the instruction 1907 // is not actually duplicated, just moved). 1908 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) 1909 return "noduplicate"; 1910 1911 // Loops generally act a lot like calls in that they act like barriers to 1912 // movement, require a certain amount of setup, etc. So when optimising for 1913 // size, we penalise any call sites that perform loops. We do this after all 1914 // other costs here, so will likely only be dealing with relatively small 1915 // functions (and hence DT and LI will hopefully be cheap). 1916 if (Caller->hasMinSize()) { 1917 DominatorTree DT(F); 1918 LoopInfo LI(DT); 1919 int NumLoops = 0; 1920 for (Loop *L : LI) { 1921 // Ignore loops that will not be executed 1922 if (DeadBlocks.count(L->getHeader())) 1923 continue; 1924 NumLoops++; 1925 } 1926 addCost(NumLoops * InlineConstants::CallPenalty); 1927 } 1928 1929 // We applied the maximum possible vector bonus at the beginning. Now, 1930 // subtract the excess bonus, if any, from the Threshold before 1931 // comparing against Cost. 
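// Concretely: a callee where vector instructions are at most roughly 10% of
// all analyzed instructions loses the entire VectorBonus, one where they are
// at most roughly half loses half of it, and only a callee that is more than
// half vector instructions keeps the full bonus.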
1932 if (NumVectorInstructions <= NumInstructions / 10) 1933 Threshold -= VectorBonus; 1934 else if (NumVectorInstructions <= NumInstructions / 2) 1935 Threshold -= VectorBonus/2; 1936 1937 return Cost < std::max(1, Threshold); 1938 } 1939 1940 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1941 /// Dump stats about this call's analysis. 1942 LLVM_DUMP_METHOD void CallAnalyzer::dump() { 1943 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" 1944 DEBUG_PRINT_STAT(NumConstantArgs); 1945 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); 1946 DEBUG_PRINT_STAT(NumAllocaArgs); 1947 DEBUG_PRINT_STAT(NumConstantPtrCmps); 1948 DEBUG_PRINT_STAT(NumConstantPtrDiffs); 1949 DEBUG_PRINT_STAT(NumInstructionsSimplified); 1950 DEBUG_PRINT_STAT(NumInstructions); 1951 DEBUG_PRINT_STAT(SROACostSavings); 1952 DEBUG_PRINT_STAT(SROACostSavingsLost); 1953 DEBUG_PRINT_STAT(LoadEliminationCost); 1954 DEBUG_PRINT_STAT(ContainsNoDuplicateCall); 1955 DEBUG_PRINT_STAT(Cost); 1956 DEBUG_PRINT_STAT(Threshold); 1957 #undef DEBUG_PRINT_STAT 1958 } 1959 #endif 1960 1961 /// Test that there are no attribute conflicts between Caller and Callee 1962 /// that prevent inlining. 1963 static bool functionsHaveCompatibleAttributes(Function *Caller, 1964 Function *Callee, 1965 TargetTransformInfo &TTI) { 1966 return TTI.areInlineCompatible(Caller, Callee) && 1967 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 1968 } 1969 1970 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) { 1971 int Cost = 0; 1972 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { 1973 if (Call.isByValArgument(I)) { 1974 // We approximate the number of loads and stores needed by dividing the 1975 // size of the byval type by the target's pointer size. 1976 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType()); 1977 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType()); 1978 unsigned AS = PTy->getAddressSpace(); 1979 unsigned PointerSize = DL.getPointerSizeInBits(AS); 1980 // Ceiling division. 1981 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; 1982 1983 // If it generates more than 8 stores it is likely to be expanded as an 1984 // inline memcpy so we take that as an upper bound. Otherwise we assume 1985 // one load and one store per word copied. 1986 // FIXME: The maxStoresPerMemcpy setting from the target should be used 1987 // here instead of a magic number of 8, but it's not available via 1988 // DataLayout. 1989 NumStores = std::min(NumStores, 8U); 1990 1991 Cost += 2 * NumStores * InlineConstants::InstrCost; 1992 } else { 1993 // For non-byval arguments subtract off one instruction per call 1994 // argument. 1995 Cost += InlineConstants::InstrCost; 1996 } 1997 } 1998 // The call instruction also disappears after inlining. 
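// Worked example (illustrative, assuming the default InstrCost of 5 and
// CallPenalty of 25): a call passing two plain i32 arguments and one byval
// argument of 64 bytes on a 64-bit target is credited 2 * 5 for the scalar
// arguments, 2 * 8 * 5 = 80 for the loads and stores of the byval copy, and
// 5 + 25 below for the call instruction itself, so getCallsiteCost returns
// 120, which analyzeCall then subtracts from the running cost.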
1999 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty; 2000 return Cost; 2001 } 2002 2003 InlineCost llvm::getInlineCost( 2004 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, 2005 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 2006 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, 2007 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 2008 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI, 2009 GetAssumptionCache, GetBFI, PSI, ORE); 2010 } 2011 2012 InlineCost llvm::getInlineCost( 2013 CallBase &Call, Function *Callee, const InlineParams &Params, 2014 TargetTransformInfo &CalleeTTI, 2015 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 2016 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, 2017 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 2018 2019 // Cannot inline indirect calls. 2020 if (!Callee) 2021 return llvm::InlineCost::getNever("indirect call"); 2022 2023 // Never inline calls with byval arguments that does not have the alloca 2024 // address space. Since byval arguments can be replaced with a copy to an 2025 // alloca, the inlined code would need to be adjusted to handle that the 2026 // argument is in the alloca address space (so it is a little bit complicated 2027 // to solve). 2028 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace(); 2029 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) 2030 if (Call.isByValArgument(I)) { 2031 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType()); 2032 if (PTy->getAddressSpace() != AllocaAS) 2033 return llvm::InlineCost::getNever("byval arguments without alloca" 2034 " address space"); 2035 } 2036 2037 // Calls to functions with always-inline attributes should be inlined 2038 // whenever possible. 2039 if (Call.hasFnAttr(Attribute::AlwaysInline)) { 2040 auto IsViable = isInlineViable(*Callee); 2041 if (IsViable) 2042 return llvm::InlineCost::getAlways("always inline attribute"); 2043 return llvm::InlineCost::getNever(IsViable.message); 2044 } 2045 2046 // Never inline functions with conflicting attributes (unless callee has 2047 // always-inline attribute). 2048 Function *Caller = Call.getCaller(); 2049 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI)) 2050 return llvm::InlineCost::getNever("conflicting attributes"); 2051 2052 // Don't inline this call if the caller has the optnone attribute. 2053 if (Caller->hasOptNone()) 2054 return llvm::InlineCost::getNever("optnone attribute"); 2055 2056 // Don't inline a function that treats null pointer as valid into a caller 2057 // that does not have this attribute. 2058 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) 2059 return llvm::InlineCost::getNever("nullptr definitions incompatible"); 2060 2061 // Don't inline functions which can be interposed at link-time. 2062 if (Callee->isInterposable()) 2063 return llvm::InlineCost::getNever("interposable"); 2064 2065 // Don't inline functions marked noinline. 2066 if (Callee->hasFnAttribute(Attribute::NoInline)) 2067 return llvm::InlineCost::getNever("noinline function attribute"); 2068 2069 // Don't inline call sites marked noinline. 2070 if (Call.isNoInline()) 2071 return llvm::InlineCost::getNever("noinline call site attribute"); 2072 2073 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 2074 << "... 
(caller:" << Caller->getName() << ")\n"); 2075 2076 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, 2077 Call, Params); 2078 InlineResult ShouldInline = CA.analyzeCall(Call); 2079 2080 LLVM_DEBUG(CA.dump()); 2081 2082 // Check if there was a reason to force inlining or no inlining. 2083 if (!ShouldInline && CA.getCost() < CA.getThreshold()) 2084 return InlineCost::getNever(ShouldInline.message); 2085 if (ShouldInline && CA.getCost() >= CA.getThreshold()) 2086 return InlineCost::getAlways("empty function"); 2087 2088 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 2089 } 2090 2091 InlineResult llvm::isInlineViable(Function &F) { 2092 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 2093 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { 2094 // Disallow inlining of functions which contain indirect branches. 2095 if (isa<IndirectBrInst>(BI->getTerminator())) 2096 return "contains indirect branches"; 2097 2098 // Disallow inlining of blockaddresses which are used by non-callbr 2099 // instructions. 2100 if (BI->hasAddressTaken()) 2101 for (User *U : BlockAddress::get(&*BI)->users()) 2102 if (!isa<CallBrInst>(*U)) 2103 return "blockaddress used outside of callbr"; 2104 2105 for (auto &II : *BI) { 2106 CallBase *Call = dyn_cast<CallBase>(&II); 2107 if (!Call) 2108 continue; 2109 2110 // Disallow recursive calls. 2111 if (&F == Call->getCalledFunction()) 2112 return "recursive call"; 2113 2114 // Disallow calls which expose returns-twice to a function not previously 2115 // attributed as such. 2116 if (!ReturnsTwice && isa<CallInst>(Call) && 2117 cast<CallInst>(Call)->canReturnTwice()) 2118 return "exposes returns-twice attribute"; 2119 2120 if (Call->getCalledFunction()) 2121 switch (Call->getCalledFunction()->getIntrinsicID()) { 2122 default: 2123 break; 2124 // Disallow inlining of @llvm.icall.branch.funnel because current 2125 // backend can't separate call targets from call arguments. 2126 case llvm::Intrinsic::icall_branch_funnel: 2127 return "disallowed inlining of @llvm.icall.branch.funnel"; 2128 // Disallow inlining functions that call @llvm.localescape. Doing this 2129 // correctly would require major changes to the inliner. 2130 case llvm::Intrinsic::localescape: 2131 return "disallowed inlining of @llvm.localescape"; 2132 // Disallow inlining of functions that initialize VarArgs with va_start. 2133 case llvm::Intrinsic::vastart: 2134 return "contains VarArgs initialized with va_start"; 2135 } 2136 } 2137 } 2138 2139 return true; 2140 } 2141 2142 // APIs to create InlineParams based on command line flags and/or other 2143 // parameters. 2144 2145 InlineParams llvm::getInlineParams(int Threshold) { 2146 InlineParams Params; 2147 2148 // This field is the threshold to use for a callee by default. This is 2149 // derived from one or more of: 2150 // * optimization or size-optimization levels, 2151 // * a value passed to createFunctionInliningPass function, or 2152 // * the -inline-threshold flag. 2153 // If the -inline-threshold flag is explicitly specified, that is used 2154 // irrespective of anything else. 2155 if (InlineThreshold.getNumOccurrences() > 0) 2156 Params.DefaultThreshold = InlineThreshold; 2157 else 2158 Params.DefaultThreshold = Threshold; 2159 2160 // Set the HintThreshold knob from the -inlinehint-threshold. 2161 Params.HintThreshold = HintThreshold; 2162 2163 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. 
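// (This knob is only consulted when profile summary information identifies
// the call site as hot.)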
2164 Params.HotCallSiteThreshold = HotCallSiteThreshold; 2165 2166 // If the -locally-hot-callsite-threshold is explicitly specified, use it to 2167 // populate LocallyHotCallSiteThreshold. Later, we populate 2168 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if 2169 // we know that optimization level is O3 (in the getInlineParams variant that 2170 // takes the opt and size levels). 2171 // FIXME: Remove this check (and make the assignment unconditional) after 2172 // addressing size regression issues at O2. 2173 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) 2174 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2175 2176 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold. 2177 Params.ColdCallSiteThreshold = ColdCallSiteThreshold; 2178 2179 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the 2180 // -inlinehint-threshold commandline option is not explicitly given. If that 2181 // option is present, then its value applies even for callees with size and 2182 // minsize attributes. 2183 // If the -inline-threshold is not specified, set the ColdThreshold from the 2184 // -inlinecold-threshold even if it is not explicitly passed. If 2185 // -inline-threshold is specified, then -inlinecold-threshold needs to be 2186 // explicitly specified to set the ColdThreshold knob 2187 if (InlineThreshold.getNumOccurrences() == 0) { 2188 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; 2189 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; 2190 Params.ColdThreshold = ColdThreshold; 2191 } else if (ColdThreshold.getNumOccurrences() > 0) { 2192 Params.ColdThreshold = ColdThreshold; 2193 } 2194 return Params; 2195 } 2196 2197 InlineParams llvm::getInlineParams() { 2198 return getInlineParams(InlineThreshold); 2199 } 2200 2201 // Compute the default threshold for inlining based on the opt level and the 2202 // size opt level. 2203 static int computeThresholdFromOptLevels(unsigned OptLevel, 2204 unsigned SizeOptLevel) { 2205 if (OptLevel > 2) 2206 return InlineConstants::OptAggressiveThreshold; 2207 if (SizeOptLevel == 1) // -Os 2208 return InlineConstants::OptSizeThreshold; 2209 if (SizeOptLevel == 2) // -Oz 2210 return InlineConstants::OptMinSizeThreshold; 2211 return InlineThreshold; 2212 } 2213 2214 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { 2215 auto Params = 2216 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); 2217 // At O3, use the value of -locally-hot-callsite-threshold option to populate 2218 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only 2219 // when it is specified explicitly. 2220 if (OptLevel > 2) 2221 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2222 return Params; 2223 } 2224
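// Illustrative usage sketch (not part of this file): a client that already
// has the callee's TTI, PSI, an OptimizationRemarkEmitter, and a way to
// obtain per-function AssumptionCaches might drive the analysis roughly like
// this; the names GetAC, ACT, CB, CalleeTTI, PSI, and ORE are placeholders,
// not existing helpers.
//
//   std::function<AssumptionCache &(Function &)> GetAC =
//       [&](Function &F) -> AssumptionCache & {
//         return ACT->getAssumptionCache(F);
//       };
//   InlineParams Params = llvm::getInlineParams(OptLevel, SizeOptLevel);
//   InlineCost IC = llvm::getInlineCost(CB, Params, CalleeTTI, GetAC,
//                                       /*GetBFI=*/None, PSI, &ORE);
//   if (IC)
//     ; // Cost is below the adjusted threshold; inlining looks profitable.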