//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  /// Whether to keep accumulating the cost past the threshold instead of
  /// bailing out early (used for debugging and remark emission).
  bool ComputeFullInlineCost;

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise clobber them.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost = 0;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at
  /// \p Call.
  bool allowSizeGrowth(CallBase &Call);

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallBase &Call, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE),
        EnableLoadElimination(true) {}

  InlineResult analyzeCall(CallBase &Call);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;
  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  void dump();
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  addCost(CostIt->second);
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
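    // For example, a constant field index of 2 into { i32, i32, i64 } would,
    // with a typical natural-alignment layout, add the third field's offset of
    // 8 bytes; array and vector indices below are instead scaled by the
    // element's alloc size.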
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see the ZeroOffset is used as a dummy value, so we can probably use
  // any bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this
      // time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      addCost(InlineConstants::CallPenalty);
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary instructions.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the below scenario inlining hot_call_X() may be
  // beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallBase &Call,
                                  BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CallSite(&Call), CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() &&
      PSI->isHotCallSite(CallSite(&Call), CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();
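
  // Note on magnitudes: with the stock command-line defaults above, this
  // routine typically starts from the 225 default threshold and may raise it
  // toward the inline-hint (325) or hot-callsite (3000) values, or lower it
  // toward the cold-callsite (45) value, before the TTI multiplier and the
  // bonus computations at the end are applied.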

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter setup
    // and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this
  // method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
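  // A null check against an SROA-able alloca is credited as a saving (it will
  // fold away if SROA fires); any other comparison pins the pointer and
  // disables SROA for it.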
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    addCost(InlineConstants::CallPenalty);

  return false;
}

bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = SimplifyFNegInst(
      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = Call.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, Call))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
      switch (II->getIntrinsicID()) {
      default:
        if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallBase(Call);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        addCost(3 * InlineConstants::InstrCost);
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't
        // free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
        InitsVargArgs = true;
        return false;
      }
    }

    if (F == Call.getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      addCost(Call.arg_size() * InlineConstants::InstrCost);

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(Call.getCalledValue()))
        addCost(InlineConstants::CallPenalty);
    }

    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = Call.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  addCost(Call.arg_size() * InlineConstants::InstrCost);

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
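  // (This catches, e.g., a call through a function pointer that an earlier
  // instruction in this walk simplified to a concrete Function.)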
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
                  IndirectCallParams);
  if (CA.analyzeCall(Call)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallBase(Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost);
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    addCost(JTCost, (int64_t)CostUpperBound);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1.
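  // For example, with 8 case clusters and no jump table, the model expects
  // 3 * 8 / 2 - 1 = 11 comparisons, each charged below as one compare plus
  // one conditional branch (2 * InstrCost).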
1489   // This will lead to a binary tree where the leaf should be either f(2) or
1490   // f(3) when n > 3. So, the number of comparisons from leaves should be n,
1491   // while the number of comparisons from non-leaf nodes should be:
1492   //   2^(log2(n) - 1) - 1
1493   //     = 2^log2(n) * 2^-1 - 1
1494   //     = n / 2 - 1.
1495   // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1496   // number of comparisons in a simple closed form:
1497   //   n + n / 2 - 1 = n * 3 / 2 - 1 (e.g. 9 case clusters => 9 + 4 - 1 = 12 compares).
1498   if (NumCaseCluster <= 3) {
1499     // Suppose a comparison includes one compare and one conditional branch.
1500     addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
1501     return false;
1502   }
1503
1504   int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1505   int64_t SwitchCost =
1506       ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1507
1508   addCost(SwitchCost, (int64_t)CostUpperBound);
1509   return false;
1510 }
1511
1512 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1513   // We never want to inline functions that contain an indirectbr. This is
1514   // incorrect because all the blockaddresses (in static global initializers
1515   // for example) would be referring to the original function, and this
1516   // indirect jump would jump from the inlined copy of the function into the
1517   // original function which is extremely undefined behavior.
1518   // FIXME: This logic isn't really right; we can safely inline functions with
1519   // indirectbrs as long as no other function or global references the
1520   // blockaddress of a block within the current function.
1521   HasIndirectBr = true;
1522   return false;
1523 }
1524
1525 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1526   // FIXME: It's not clear that a single instruction is an accurate model for
1527   // the inline cost of a resume instruction.
1528   return false;
1529 }
1530
1531 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1532   // FIXME: It's not clear that a single instruction is an accurate model for
1533   // the inline cost of a cleanupret instruction.
1534   return false;
1535 }
1536
1537 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1538   // FIXME: It's not clear that a single instruction is an accurate model for
1539   // the inline cost of a catchret instruction.
1540   return false;
1541 }
1542
1543 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1544   // FIXME: It might be reasonable to discount the cost of instructions leading
1545   // to unreachable as they have the lowest possible impact on both runtime and
1546   // code size.
1547   return true; // No actual code is needed for unreachable.
1548 }
1549
1550 bool CallAnalyzer::visitInstruction(Instruction &I) {
1551   // Some instructions are free. All of the free intrinsics can also be
1552   // handled by SROA, etc.
1553   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1554     return true;
1555
1556   // We found something we don't understand or can't handle. Mark any SROA-able
1557   // values in the operand list as no longer viable.
1558   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1559     disableSROA(*OI);
1560
1561   return false;
1562 }
1563
1564 /// Analyze a basic block for its contribution to the inline cost.
1565 ///
1566 /// This method walks the analyzer over every instruction in the given basic
1567 /// block and accounts for their cost during inlining at this callsite. It
1568 /// aborts early if the threshold has been exceeded or an impossible-to-inline
1569 /// construct has been detected. It returns false if inlining is no longer
1570 /// viable, and true if inlining remains viable.
1571 InlineResult
1572 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1573                            SmallPtrSetImpl<const Value *> &EphValues) {
1574   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1575     // FIXME: Currently, the number of instructions in a function, regardless
1576     // of our ability to simplify them during inlining to constants or dead
1577     // code, is actually used by the vector bonus heuristic. As long as that's
1578     // true, we have to special case debug intrinsics here to prevent
1579     // differences in inlining due to debug symbols. Eventually, the number of
1580     // unsimplified instructions shouldn't factor into the cost computation,
1581     // but until then, hack around it here.
1582     if (isa<DbgInfoIntrinsic>(I))
1583       continue;
1584
1585     // Skip ephemeral values.
1586     if (EphValues.count(&*I))
1587       continue;
1588
1589     ++NumInstructions;
1590     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1591       ++NumVectorInstructions;
1592
1593     // If the instruction simplified to a constant, there is no cost to this
1594     // instruction. Visit the instructions using our InstVisitor to account for
1595     // all of the per-instruction logic. The visit tree returns true if we
1596     // consumed the instruction in any way, and false if the instruction's base
1597     // cost should count against inlining.
1598     if (Base::visit(&*I))
1599       ++NumInstructionsSimplified;
1600     else
1601       addCost(InlineConstants::InstrCost);
1602
1603     using namespace ore;
1604     // If visiting this instruction detected an uninlinable pattern, abort.
1605     InlineResult IR;
1606     if (IsRecursiveCall)
1607       IR = "recursive";
1608     else if (ExposesReturnsTwice)
1609       IR = "exposes returns twice";
1610     else if (HasDynamicAlloca)
1611       IR = "dynamic alloca";
1612     else if (HasIndirectBr)
1613       IR = "indirect branch";
1614     else if (HasUninlineableIntrinsic)
1615       IR = "uninlinable intrinsic";
1616     else if (InitsVargArgs)
1617       IR = "varargs";
1618     if (!IR) {
1619       if (ORE)
1620         ORE->emit([&]() {
1621           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1622                                           &CandidateCall)
1623                  << NV("Callee", &F) << " has uninlinable pattern ("
1624                  << NV("InlineResult", IR.message)
1625                  << ") and cost is not fully computed";
1626         });
1627       return IR;
1628     }
1629
1630     // If the caller is a recursive function then we don't want to inline
1631     // functions which allocate a lot of stack space because it would increase
1632     // the caller stack usage dramatically.
1633     if (IsCallerRecursive &&
1634         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1635       InlineResult IR = "recursive and allocates too much stack space";
1636       if (ORE)
1637         ORE->emit([&]() {
1638           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1639                                           &CandidateCall)
1640                  << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1641                  << ". Cost is not fully computed";
1642         });
1643       return IR;
1644     }
1645
1646     // Check if we've passed the maximum possible threshold so we don't spin in
1647     // huge basic blocks that will never inline.
1648     if (Cost >= Threshold && !ComputeFullInlineCost)
1649       return false;
1650   }
1651
1652   return true;
1653 }
1654
1655 /// Compute the base pointer and cumulative constant offsets for V.
1656 ///
1657 /// This strips all constant offsets off of V, leaving it the base pointer, and
1658 /// accumulates the total constant offset applied in the returned constant.
It 1659 /// returns 0 if V is not a pointer, and returns the constant '0' if there are 1660 /// no constant offsets applied. 1661 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { 1662 if (!V->getType()->isPointerTy()) 1663 return nullptr; 1664 1665 unsigned AS = V->getType()->getPointerAddressSpace(); 1666 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); 1667 APInt Offset = APInt::getNullValue(IntPtrWidth); 1668 1669 // Even though we don't look through PHI nodes, we could be called on an 1670 // instruction in an unreachable block, which may be on a cycle. 1671 SmallPtrSet<Value *, 4> Visited; 1672 Visited.insert(V); 1673 do { 1674 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { 1675 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) 1676 return nullptr; 1677 V = GEP->getPointerOperand(); 1678 } else if (Operator::getOpcode(V) == Instruction::BitCast) { 1679 V = cast<Operator>(V)->getOperand(0); 1680 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { 1681 if (GA->isInterposable()) 1682 break; 1683 V = GA->getAliasee(); 1684 } else { 1685 break; 1686 } 1687 assert(V->getType()->isPointerTy() && "Unexpected operand type!"); 1688 } while (Visited.insert(V).second); 1689 1690 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS); 1691 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset)); 1692 } 1693 1694 /// Find dead blocks due to deleted CFG edges during inlining. 1695 /// 1696 /// If we know the successor of the current block, \p CurrBB, has to be \p 1697 /// NextBB, the other successors of \p CurrBB are dead if these successors have 1698 /// no live incoming CFG edges. If one block is found to be dead, we can 1699 /// continue growing the dead block list by checking the successors of the dead 1700 /// blocks to see if all their incoming edges are dead or not. 1701 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { 1702 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { 1703 // A CFG edge is dead if the predecessor is dead or the predecessor has a 1704 // known successor which is not the one under exam. 1705 return (DeadBlocks.count(Pred) || 1706 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); 1707 }; 1708 1709 auto IsNewlyDead = [&](BasicBlock *BB) { 1710 // If all the edges to a block are dead, the block is also dead. 1711 return (!DeadBlocks.count(BB) && 1712 llvm::all_of(predecessors(BB), 1713 [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); 1714 }; 1715 1716 for (BasicBlock *Succ : successors(CurrBB)) { 1717 if (Succ == NextBB || !IsNewlyDead(Succ)) 1718 continue; 1719 SmallVector<BasicBlock *, 4> NewDead; 1720 NewDead.push_back(Succ); 1721 while (!NewDead.empty()) { 1722 BasicBlock *Dead = NewDead.pop_back_val(); 1723 if (DeadBlocks.insert(Dead)) 1724 // Continue growing the dead block lists. 1725 for (BasicBlock *S : successors(Dead)) 1726 if (IsNewlyDead(S)) 1727 NewDead.push_back(S); 1728 } 1729 } 1730 } 1731 1732 /// Analyze a call site for potential inlining. 1733 /// 1734 /// Returns true if inlining this call is viable, and false if it is not 1735 /// viable. It computes the cost and adjusts the threshold based on numerous 1736 /// factors and heuristics. If this method returns false but the computed cost 1737 /// is below the computed threshold, then inlining was forcibly disabled by 1738 /// some artifact of the routine. 
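///
/// Illustrative note (hypothetical numbers, not constants taken from this
/// file): if updateThreshold() were to produce Threshold = 200 with
/// SingleBBBonus = 100 and VectorBonus = 300, the analysis below would start
/// out comparing Cost against 600; the single-BB bonus is retracted as soon as
/// a block with multiple successors is seen, and the vector bonus is fully or
/// partially retracted at the end based on the fraction of vector instructions
/// in the callee.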
1739 InlineResult CallAnalyzer::analyzeCall(CallBase &Call) { 1740 ++NumCallsAnalyzed; 1741 1742 // Perform some tweaks to the cost and threshold based on the direct 1743 // callsite information. 1744 1745 // We want to more aggressively inline vector-dense kernels, so up the 1746 // threshold, and we'll lower it if the % of vector instructions gets too 1747 // low. Note that these bonuses are some what arbitrary and evolved over time 1748 // by accident as much as because they are principled bonuses. 1749 // 1750 // FIXME: It would be nice to remove all such bonuses. At least it would be 1751 // nice to base the bonus values on something more scientific. 1752 assert(NumInstructions == 0); 1753 assert(NumVectorInstructions == 0); 1754 1755 // Update the threshold based on callsite properties 1756 updateThreshold(Call, F); 1757 1758 // While Threshold depends on commandline options that can take negative 1759 // values, we want to enforce the invariant that the computed threshold and 1760 // bonuses are non-negative. 1761 assert(Threshold >= 0); 1762 assert(SingleBBBonus >= 0); 1763 assert(VectorBonus >= 0); 1764 1765 // Speculatively apply all possible bonuses to Threshold. If cost exceeds 1766 // this Threshold any time, and cost cannot decrease, we can stop processing 1767 // the rest of the function body. 1768 Threshold += (SingleBBBonus + VectorBonus); 1769 1770 // Give out bonuses for the callsite, as the instructions setting them up 1771 // will be gone after inlining. 1772 addCost(-getCallsiteCost(Call, DL)); 1773 1774 // If this function uses the coldcc calling convention, prefer not to inline 1775 // it. 1776 if (F.getCallingConv() == CallingConv::Cold) 1777 Cost += InlineConstants::ColdccPenalty; 1778 1779 // Check if we're done. This can happen due to bonuses and penalties. 1780 if (Cost >= Threshold && !ComputeFullInlineCost) 1781 return "high cost"; 1782 1783 if (F.empty()) 1784 return true; 1785 1786 Function *Caller = Call.getFunction(); 1787 // Check if the caller function is recursive itself. 1788 for (User *U : Caller->users()) { 1789 CallBase *Call = dyn_cast<CallBase>(U); 1790 if (Call && Call->getFunction() == Caller) { 1791 IsCallerRecursive = true; 1792 break; 1793 } 1794 } 1795 1796 // Populate our simplified values by mapping from function arguments to call 1797 // arguments with known important simplifications. 1798 auto CAI = Call.arg_begin(); 1799 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end(); 1800 FAI != FAE; ++FAI, ++CAI) { 1801 assert(CAI != Call.arg_end()); 1802 if (Constant *C = dyn_cast<Constant>(CAI)) 1803 SimplifiedValues[&*FAI] = C; 1804 1805 Value *PtrArg = *CAI; 1806 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { 1807 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); 1808 1809 // We can SROA any pointer arguments derived from alloca instructions. 1810 if (isa<AllocaInst>(PtrArg)) { 1811 SROAArgValues[&*FAI] = PtrArg; 1812 SROAArgCosts[PtrArg] = 0; 1813 } 1814 } 1815 } 1816 NumConstantArgs = SimplifiedValues.size(); 1817 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); 1818 NumAllocaArgs = SROAArgValues.size(); 1819 1820 // FIXME: If a caller has multiple calls to a callee, we end up recomputing 1821 // the ephemeral values multiple times (and they're completely determined by 1822 // the callee, so this is purely duplicate work). 
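  // Ephemeral values are values that are only used (transitively) by
  // @llvm.assume intrinsics; they generate no real code, so analyzeBlock()
  // skips them instead of charging them to the inline cost.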
1823   SmallPtrSet<const Value *, 32> EphValues;
1824   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1825
1826   // The worklist of live basic blocks in the callee *after* inlining. We avoid
1827   // adding basic blocks of the callee which can be proven to be dead for this
1828   // particular call site in order to get more accurate cost estimates. This
1829   // requires a somewhat heavyweight iteration pattern: we need to walk the
1830   // basic blocks in a breadth-first order as we insert live successors. To
1831   // accomplish this, and because we expect to exit early once we cross our
1832   // threshold, we use a SetVector optimized for the small-size case.
1833   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1834                     SmallPtrSet<BasicBlock *, 16>>
1835       BBSetVector;
1836   BBSetVector BBWorklist;
1837   BBWorklist.insert(&F.getEntryBlock());
1838   bool SingleBB = true;
1839   // Note that we *must not* cache the size, this loop grows the worklist.
1840   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1841     // Bail out the moment we cross the threshold. This means we'll under-count
1842     // the cost, but only when undercounting doesn't matter.
1843     if (Cost >= Threshold && !ComputeFullInlineCost)
1844       break;
1845
1846     BasicBlock *BB = BBWorklist[Idx];
1847     if (BB->empty())
1848       continue;
1849
1850     // Disallow inlining a blockaddress with uses other than strictly callbr.
1851     // A blockaddress only has defined behavior for an indirect branch in the
1852     // same function, and we do not currently support inlining indirect
1853     // branches. But, the inliner may not see an indirect branch that ends up
1854     // being dead code at a particular call site. If the blockaddress escapes
1855     // the function, e.g., via a global variable, inlining may lead to an
1856     // invalid cross-function reference.
1857     // FIXME: pr/39560: continue relaxing this overt restriction.
1858     if (BB->hasAddressTaken())
1859       for (User *U : BlockAddress::get(&*BB)->users())
1860         if (!isa<CallBrInst>(*U))
1861           return "blockaddress used outside of callbr";
1862
1863     // Analyze the cost of this block. If we blow through the threshold, this
1864     // returns false, and we can bail out.
1865     InlineResult IR = analyzeBlock(BB, EphValues);
1866     if (!IR)
1867       return IR;
1868
1869     Instruction *TI = BB->getTerminator();
1870
1871     // Add in the live successors by first checking whether we have a terminator
1872     // that may be simplified based on the values simplified by this call.
1873     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1874       if (BI->isConditional()) {
1875         Value *Cond = BI->getCondition();
1876         if (ConstantInt *SimpleCond =
1877                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1878           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1879           BBWorklist.insert(NextBB);
1880           KnownSuccessors[BB] = NextBB;
1881           findDeadBlocks(BB, NextBB);
1882           continue;
1883         }
1884       }
1885     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1886       Value *Cond = SI->getCondition();
1887       if (ConstantInt *SimpleCond =
1888               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1889         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1890         BBWorklist.insert(NextBB);
1891         KnownSuccessors[BB] = NextBB;
1892         findDeadBlocks(BB, NextBB);
1893         continue;
1894       }
1895     }
1896
1897     // If we're unable to select a particular successor, just count all of
1898     // them.
1899     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1900          ++TIdx)
1901       BBWorklist.insert(TI->getSuccessor(TIdx));
1902
1903     // If we had any successors at this point, then post-inlining is likely to
1904     // have them as well. Note that we assume any basic blocks which existed
1905     // due to branches or switches which folded above will also fold after
1906     // inlining.
1907     if (SingleBB && TI->getNumSuccessors() > 1) {
1908       // Take off the bonus we applied to the threshold.
1909       Threshold -= SingleBBBonus;
1910       SingleBB = false;
1911     }
1912   }
1913
1914   bool OnlyOneCallAndLocalLinkage =
1915       F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
1916   // If this is a noduplicate call, we can still inline as long as
1917   // inlining this would cause the removal of the caller (so the instruction
1918   // is not actually duplicated, just moved).
1919   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1920     return "noduplicate";
1921
1922   // Loops generally act a lot like calls in that they act like barriers to
1923   // movement, require a certain amount of setup, etc. So when optimising for
1924   // size, we penalise any call sites that contain loops. We do this after all
1925   // other costs here, so we will likely only be dealing with relatively small
1926   // functions (and hence DT and LI will hopefully be cheap).
1927   if (Caller->hasMinSize()) {
1928     DominatorTree DT(F);
1929     LoopInfo LI(DT);
1930     int NumLoops = 0;
1931     for (Loop *L : LI) {
1932       // Ignore loops that will not be executed.
1933       if (DeadBlocks.count(L->getHeader()))
1934         continue;
1935       NumLoops++;
1936     }
1937     addCost(NumLoops * InlineConstants::CallPenalty);
1938   }
1939
1940   // We applied the maximum possible vector bonus at the beginning. Now,
1941   // subtract the excess bonus, if any, from the Threshold before
1942   // comparing against Cost.
1943   if (NumVectorInstructions <= NumInstructions / 10)
1944     Threshold -= VectorBonus;
1945   else if (NumVectorInstructions <= NumInstructions / 2)
1946     Threshold -= VectorBonus / 2;
1947
1948   return Cost < std::max(1, Threshold);
1949 }
1950
1951 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1952 /// Dump stats about this call's analysis.
1953 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1954 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1955   DEBUG_PRINT_STAT(NumConstantArgs);
1956   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1957   DEBUG_PRINT_STAT(NumAllocaArgs);
1958   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1959   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1960   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1961   DEBUG_PRINT_STAT(NumInstructions);
1962   DEBUG_PRINT_STAT(SROACostSavings);
1963   DEBUG_PRINT_STAT(SROACostSavingsLost);
1964   DEBUG_PRINT_STAT(LoadEliminationCost);
1965   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1966   DEBUG_PRINT_STAT(Cost);
1967   DEBUG_PRINT_STAT(Threshold);
1968 #undef DEBUG_PRINT_STAT
1969 }
1970 #endif
1971
1972 /// Test that there are no attribute conflicts between Caller and Callee
1973 /// that prevent inlining.
1974 static bool functionsHaveCompatibleAttributes(Function *Caller,
1975                                               Function *Callee,
1976                                               TargetTransformInfo &TTI) {
1977   return TTI.areInlineCompatible(Caller, Callee) &&
1978          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1979 }
1980
1981 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
1982   int Cost = 0;
1983   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
1984     if (Call.isByValArgument(I)) {
1985       // We approximate the number of loads and stores needed by dividing the
1986       // size of the byval type by the target's pointer size.
1987       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
1988       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1989       unsigned AS = PTy->getAddressSpace();
1990       unsigned PointerSize = DL.getPointerSizeInBits(AS);
1991       // Ceiling division.
1992       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1993
1994       // If it generates more than 8 stores it is likely to be expanded as an
1995       // inline memcpy so we take that as an upper bound. Otherwise we assume
1996       // one load and one store per word copied (e.g. a 32-byte byval with 64-bit pointers is 4 words, so 2 * 4 * InstrCost).
1997       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1998       // here instead of a magic number of 8, but it's not available via
1999       // DataLayout.
2000       NumStores = std::min(NumStores, 8U);
2001
2002       Cost += 2 * NumStores * InlineConstants::InstrCost;
2003     } else {
2004       // For non-byval arguments, count one instruction per call
2005       // argument.
2006       Cost += InlineConstants::InstrCost;
2007     }
2008   }
2009   // The call instruction also disappears after inlining.
2010   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
2011   return Cost;
2012 }
2013
2014 InlineCost llvm::getInlineCost(
2015     CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2016     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2017     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2018     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2019   return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2020                        GetAssumptionCache, GetBFI, PSI, ORE);
2021 }
2022
2023 InlineCost llvm::getInlineCost(
2024     CallBase &Call, Function *Callee, const InlineParams &Params,
2025     TargetTransformInfo &CalleeTTI,
2026     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2027     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2028     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2029
2030   // Cannot inline indirect calls.
2031   if (!Callee)
2032     return llvm::InlineCost::getNever("indirect call");
2033
2034   // Never inline calls with byval arguments that do not have the alloca
2035   // address space. Since byval arguments can be replaced with a copy to an
2036   // alloca, the inlined code would need to be adjusted to handle the fact that
2037   // the argument is in the alloca address space (so it is a little bit
2038   // complicated to solve).
2039   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2040   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2041     if (Call.isByValArgument(I)) {
2042       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2043       if (PTy->getAddressSpace() != AllocaAS)
2044         return llvm::InlineCost::getNever("byval arguments without alloca"
2045                                           " address space");
2046     }
2047
2048   // Calls to functions with always-inline attributes should be inlined
2049   // whenever possible.
2050 if (Call.hasFnAttr(Attribute::AlwaysInline)) { 2051 auto IsViable = isInlineViable(*Callee); 2052 if (IsViable) 2053 return llvm::InlineCost::getAlways("always inline attribute"); 2054 return llvm::InlineCost::getNever(IsViable.message); 2055 } 2056 2057 // Never inline functions with conflicting attributes (unless callee has 2058 // always-inline attribute). 2059 Function *Caller = Call.getCaller(); 2060 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI)) 2061 return llvm::InlineCost::getNever("conflicting attributes"); 2062 2063 // Don't inline this call if the caller has the optnone attribute. 2064 if (Caller->hasOptNone()) 2065 return llvm::InlineCost::getNever("optnone attribute"); 2066 2067 // Don't inline a function that treats null pointer as valid into a caller 2068 // that does not have this attribute. 2069 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) 2070 return llvm::InlineCost::getNever("nullptr definitions incompatible"); 2071 2072 // Don't inline functions which can be interposed at link-time. 2073 if (Callee->isInterposable()) 2074 return llvm::InlineCost::getNever("interposable"); 2075 2076 // Don't inline functions marked noinline. 2077 if (Callee->hasFnAttribute(Attribute::NoInline)) 2078 return llvm::InlineCost::getNever("noinline function attribute"); 2079 2080 // Don't inline call sites marked noinline. 2081 if (Call.isNoInline()) 2082 return llvm::InlineCost::getNever("noinline call site attribute"); 2083 2084 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 2085 << "... (caller:" << Caller->getName() << ")\n"); 2086 2087 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, 2088 Call, Params); 2089 InlineResult ShouldInline = CA.analyzeCall(Call); 2090 2091 LLVM_DEBUG(CA.dump()); 2092 2093 // Check if there was a reason to force inlining or no inlining. 2094 if (!ShouldInline && CA.getCost() < CA.getThreshold()) 2095 return InlineCost::getNever(ShouldInline.message); 2096 if (ShouldInline && CA.getCost() >= CA.getThreshold()) 2097 return InlineCost::getAlways("empty function"); 2098 2099 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 2100 } 2101 2102 InlineResult llvm::isInlineViable(Function &F) { 2103 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 2104 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { 2105 // Disallow inlining of functions which contain indirect branches. 2106 if (isa<IndirectBrInst>(BI->getTerminator())) 2107 return "contains indirect branches"; 2108 2109 // Disallow inlining of blockaddresses which are used by non-callbr 2110 // instructions. 2111 if (BI->hasAddressTaken()) 2112 for (User *U : BlockAddress::get(&*BI)->users()) 2113 if (!isa<CallBrInst>(*U)) 2114 return "blockaddress used outside of callbr"; 2115 2116 for (auto &II : *BI) { 2117 CallBase *Call = dyn_cast<CallBase>(&II); 2118 if (!Call) 2119 continue; 2120 2121 // Disallow recursive calls. 2122 if (&F == Call->getCalledFunction()) 2123 return "recursive call"; 2124 2125 // Disallow calls which expose returns-twice to a function not previously 2126 // attributed as such. 
2127       if (!ReturnsTwice && isa<CallInst>(Call) &&
2128           cast<CallInst>(Call)->canReturnTwice())
2129         return "exposes returns-twice attribute";
2130
2131       if (Call->getCalledFunction())
2132         switch (Call->getCalledFunction()->getIntrinsicID()) {
2133         default:
2134           break;
2135         // Disallow inlining of @llvm.icall.branch.funnel because the current
2136         // backend can't separate call targets from call arguments.
2137         case llvm::Intrinsic::icall_branch_funnel:
2138           return "disallowed inlining of @llvm.icall.branch.funnel";
2139         // Disallow inlining functions that call @llvm.localescape. Doing this
2140         // correctly would require major changes to the inliner.
2141         case llvm::Intrinsic::localescape:
2142           return "disallowed inlining of @llvm.localescape";
2143         // Disallow inlining of functions that initialize VarArgs with va_start.
2144         case llvm::Intrinsic::vastart:
2145           return "contains VarArgs initialized with va_start";
2146         }
2147     }
2148   }
2149
2150   return true;
2151 }
2152
2153 // APIs to create InlineParams based on command line flags and/or other
2154 // parameters.
2155
2156 InlineParams llvm::getInlineParams(int Threshold) {
2157   InlineParams Params;
2158
2159   // This field is the threshold to use for a callee by default. This is
2160   // derived from one or more of:
2161   //  * optimization or size-optimization levels,
2162   //  * a value passed to the createFunctionInliningPass function, or
2163   //  * the -inline-threshold flag.
2164   // If the -inline-threshold flag is explicitly specified, that is used
2165   // irrespective of anything else.
2166   if (InlineThreshold.getNumOccurrences() > 0)
2167     Params.DefaultThreshold = InlineThreshold;
2168   else
2169     Params.DefaultThreshold = Threshold;
2170
2171   // Set the HintThreshold knob from the -inlinehint-threshold.
2172   Params.HintThreshold = HintThreshold;
2173
2174   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2175   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2176
2177   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2178   // populate LocallyHotCallSiteThreshold. Later, we populate
2179   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2180   // we know that the optimization level is O3 (in the getInlineParams variant
2181   // that takes the opt and size levels).
2182   // FIXME: Remove this check (and make the assignment unconditional) after
2183   // addressing size regression issues at O2.
2184   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2185     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2186
2187   // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2188   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2189
2190   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2191   // -inline-threshold command-line option is not explicitly given. If that
2192   // option is present, then its value applies even for callees with size and
2193   // minsize attributes.
2194   // If the -inline-threshold is not specified, set the ColdThreshold from the
2195   // -inlinecold-threshold even if it is not explicitly passed.
If 2196 // -inline-threshold is specified, then -inlinecold-threshold needs to be 2197 // explicitly specified to set the ColdThreshold knob 2198 if (InlineThreshold.getNumOccurrences() == 0) { 2199 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; 2200 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; 2201 Params.ColdThreshold = ColdThreshold; 2202 } else if (ColdThreshold.getNumOccurrences() > 0) { 2203 Params.ColdThreshold = ColdThreshold; 2204 } 2205 return Params; 2206 } 2207 2208 InlineParams llvm::getInlineParams() { 2209 return getInlineParams(InlineThreshold); 2210 } 2211 2212 // Compute the default threshold for inlining based on the opt level and the 2213 // size opt level. 2214 static int computeThresholdFromOptLevels(unsigned OptLevel, 2215 unsigned SizeOptLevel) { 2216 if (OptLevel > 2) 2217 return InlineConstants::OptAggressiveThreshold; 2218 if (SizeOptLevel == 1) // -Os 2219 return InlineConstants::OptSizeThreshold; 2220 if (SizeOptLevel == 2) // -Oz 2221 return InlineConstants::OptMinSizeThreshold; 2222 return InlineThreshold; 2223 } 2224 2225 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { 2226 auto Params = 2227 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); 2228 // At O3, use the value of -locally-hot-callsite-threshold option to populate 2229 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only 2230 // when it is specified explicitly. 2231 if (OptLevel > 2) 2232 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2233 return Params; 2234 } 2235
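// Illustrative usage (a rough sketch only, not compiled as part of this file;
// it assumes the caller already has a CallBase &CB for the call site, the
// callee's TargetTransformInfo CalleeTTI, a ProfileSummaryInfo *PSI, an
// OptimizationRemarkEmitter ORE, and a FunctionAnalysisManager FAM in scope):
//
//   InlineParams Params = getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/0);
//   std::function<AssumptionCache &(Function &)> GetAC =
//       [&](Function &F) -> AssumptionCache & {
//     return FAM.getResult<AssumptionAnalysis>(F);
//   };
//   InlineCost IC = getInlineCost(CB, Params, CalleeTTI, GetAC,
//                                 /*GetBFI=*/None, PSI, &ORE);
//   if (IC)
//     ; // Cost is under the threshold (or "always"); inlining looks viable.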