//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomConditionCache.h"
#include "llvm/Analysis/EphemeralValuesCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instructions based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per-TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per-TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int> InlineAsmInstrCost(
    "inline-asm-instr-cost", cl::Hidden, cl::init(0),
    cl::desc("Cost of a single inline asm instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
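
// Usage sketch (illustrative only): given a call site CB carrying the string
// attribute "function-inline-cost"="42",
//
//   if (std::optional<int> C = getStringFnAttrAsInt(CB, "function-inline-cost"))
//     ...; // *C == 42
//
// A missing attribute or a non-integer attribute value yields std::nullopt.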

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Getter for TargetLibraryInfo.
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Getter for the cache of ephemeral values.
  function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;

  /// Extension points for handling callsite features.
  // Called before a basic block is analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block has been analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction is analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction has been analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've already determined the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated).  It checks callsite-specific information. Return a
  /// reason the analysis can't continue if that's the case, or 'true' if it
  /// may continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation that the inlining would result in
  /// a load elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUnreachable) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Account for inline assembly instructions.
  virtual void onInlineAsm(const InlineAsm &Arg) {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumInlineAsmInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  /// Note: The simplified Value may be owned by the caller function.
  DenseMap<Value *, Value *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive calls.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  /// Use a value in its given form directly if possible, otherwise try looking
  /// for it in SimplifiedValues.
  template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
    if (auto *Direct = dyn_cast<T>(V))
      return Direct;
    return getSimplifiedValue<T>(V);
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyCmpInstForRecCall(CmpInst &Cmp);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  bool isLoweredToCall(Function *F, CallBase &Call);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.  Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            const SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(
      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}

  InlineResult analyze();

  /// Lookup simplified Value. May return a value owned by the caller.
  Value *getSimplifiedValueUnchecked(Value *V) const {
    return SimplifiedValues.lookup(V);
  }

  /// Lookup simplified Value, but return nullptr if the simplified value is
  /// owned by the caller.
  template <typename T> T *getSimplifiedValue(Value *V) const {
    Value *SimpleV = SimplifiedValues.lookup(V);
    if (!SimpleV)
      return nullptr;

    // Skip checks if we know T is a global. This has a small, but measurable
    // impact on compile-time.
    if constexpr (std::is_base_of_v<Constant, T>)
      return dyn_cast<T>(SimpleV);

    // Make sure the simplified Value is owned by this function.
    if (auto *I = dyn_cast<Instruction>(SimpleV)) {
      if (I->getFunction() != &F)
        return nullptr;
    } else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
      if (Arg->getParent() != &F)
        return nullptr;
    } else if (!isa<Constant>(SimpleV))
      return nullptr;
    return dyn_cast<T>(SimpleV);
  }
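
  // Illustrative note (not from the upstream source): if a callee value V has
  // been simplified to a ConstantInt, getSimplifiedValue<ConstantInt>(V)
  // returns it, since constants are module-owned and always safe to inspect.
  // If V was simplified to an Instruction or Argument living in the caller
  // rather than in the callee F, the ownership checks above return nullptr,
  // so callers of this helper never see values they must not inspect.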

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search tree, we should find the number of
// nodes, which is the same as the number of comparisons when lowered. For a
// given number of clusters, n, we can define a recursive function, f(n), to
// find the number of nodes in the tree. The recursion is:
// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This will lead to a binary tree where the leaf should be either f(2) or f(3)
// when n > 3.  So, the number of comparisons from leaves should be n, while
// the number of non-leaf nodes should be:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
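
// Sanity check of the closed form above (illustrative): for n = 8 clusters,
// the recursion gives f(4) = 1 + f(2) + f(2) = 5 and
// f(8) = 1 + f(4) + f(4) = 11, matching 3 * 8 / 2 - 1 = 11.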

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when the percentage of vector instructions in the
  /// callee is high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only when the
  // flag PrintInstructionComments is on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed.  At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks.  This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }
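
  // The two clamps above make Cost saturate at INT_MIN/INT_MAX rather than
  // wrap: e.g. once Cost has reached INT_MAX, any further positive increment
  // leaves it unchanged.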

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUnreachable) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // The maximum valid cost is increased in this function.
    if (JumpTableSize) {
      // Suppose a default branch includes one compare and one conditional
      // branch if it's reachable.
      if (!DefaultDestUnreachable)
        addCost(2 * InstrCost);
      // Suppose a jump table requires one load and one jump instruction.
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      // We can save a pair of instructions if the default branch is
      // unreachable.
      addCost((NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }
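
  // Worked example for the switch costs above (illustrative, using the
  // default InstrCost of 5): a jump table of 16 entries with a reachable
  // default costs 2*5 + (16*5 + 2*5) = 100 units, while a 7-cluster switch
  // without a jump table costs getExpectedNumberOfCompare(7) * 2 * 5
  // = 9 * 2 * 5 = 90 units.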

  // Parses the inline assembly argument to account for its cost. Inline
  // assembly instructions incur higher costs for inlining since they cannot be
  // analyzed and optimized.
  void onInlineAsm(const InlineAsm &Arg) override {
    if (!InlineAsmInstrCost)
      return;
    SmallVector<StringRef, 4> AsmStrs;
    Arg.collectAsmStrs(AsmStrs);
    int SectionLevel = 0;
    int InlineAsmInstrCount = 0;
    for (StringRef AsmStr : AsmStrs) {
      // Trim whitespace and comments.
      StringRef Trimmed = AsmStr.trim();
      size_t hashPos = Trimmed.find('#');
      if (hashPos != StringRef::npos)
        Trimmed = Trimmed.substr(0, hashPos);
      // Ignore comments.
      if (Trimmed.empty())
        continue;
      // Filter out the outlined assembly instructions from the cost by keeping
      // track of the section level and only accounting for instructions at
      // section level zero. Note there will be duplication in outlined
      // sections too, but it is not accounted for in the inlining cost model.
      if (Trimmed.starts_with(".pushsection")) {
        ++SectionLevel;
        continue;
      }
      if (Trimmed.starts_with(".popsection")) {
        --SectionLevel;
        continue;
      }
      // Ignore directives and labels.
      if (Trimmed.starts_with(".") || Trimmed.contains(":"))
        continue;
      if (SectionLevel == 0)
        ++InlineAsmInstrCount;
    }
    NumInlineAsmInstructions += InlineAsmInstrCount;
    addCost(InlineAsmInstrCount * InlineAsmInstrCost);
  }
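
  // For example (illustrative): given the asm string
  //   "mov x0, x1\n.pushsection .text.cold\nnop\n.popsection\nfoo:\nadd x0, x0, x2"
  // the loop above counts two instructions (mov and add); the nop is skipped
  // because it sits inside a pushed section, and the directive lines and the
  // label are ignored.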

  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks.  For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction is assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostBefore = Cost;
    CostDetail.ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find the new values of cost and threshold
    // after the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostAfter = Cost;
    CostDetail.ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require an instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call sites.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into account
  // both the size cost and the cycle savings.  Return std::nullopt if we don't
  // have sufficient profiling information to decide.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build.  Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid.  Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use a 128-bit APInt here to avoid potential overflow.  This variable
    // should stay well below 10^24 (or 2^80) in practice.  This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              getSimplifiedValue<ConstantInt>(BI->getCondition())) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);
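    // Adding EntryCount / 2 before the unsigned division rounds the per-call
    // savings to the nearest integer rather than always truncating.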

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size.  We accept the inlining
    // opportunity if R is really high and reject if R is really low.  If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
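    //
    // Illustrative numbers: with HotCountThreshold = 10000, Size = 10 (so
    // Threshold = 100000) and the default multipliers 8 and 4, CycleSavings =
    // 20000 is accepted (20000 * 8 = 160000 >= 100000) and CycleSavings =
    // 2000 is rejected (2000 * 4 = 8000 < 100000). A middle zone that falls
    // through to the cost-based analysis only exists when the profitable
    // multiplier exceeds the savings multiplier.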
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on command-line options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }
1200 
1201 public:
InlineCostCallAnalyzer(Function & Callee,CallBase & Call,const InlineParams & Params,const TargetTransformInfo & TTI,function_ref<AssumptionCache & (Function &)> GetAssumptionCache,function_ref<BlockFrequencyInfo & (Function &)> GetBFI=nullptr,function_ref<const TargetLibraryInfo & (Function &)> GetTLI=nullptr,ProfileSummaryInfo * PSI=nullptr,OptimizationRemarkEmitter * ORE=nullptr,bool BoostIndirect=true,bool IgnoreThreshold=false,function_ref<EphemeralValuesCache & (Function &)> GetEphValuesCache=nullptr)1202   InlineCostCallAnalyzer(
1203       Function &Callee, CallBase &Call, const InlineParams &Params,
1204       const TargetTransformInfo &TTI,
1205       function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1206       function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1207       function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
1208       ProfileSummaryInfo *PSI = nullptr,
1209       OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1210       bool IgnoreThreshold = false,
1211       function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
1212           nullptr)
1213       : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
1214                      ORE, GetEphValuesCache),
1215         ComputeFullInlineCost(OptComputeFullInlineCost ||
1216                               Params.ComputeFullInlineCost || ORE ||
1217                               isCostBenefitAnalysisEnabled()),
1218         Params(Params), Threshold(Params.DefaultThreshold),
1219         BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1220         CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1221         Writer(this) {
1222     AllowRecursiveCall = *Params.AllowRecursiveCall;
1223   }
1224 
1225   /// Annotation Writer for instruction details
1226   InlineCostAnnotationWriter Writer;
1227 
1228   void dump();
1229 
1230   // Prints the same analysis as dump(), but its definition does not depend
1231   // on the build type.
1232   void print(raw_ostream &OS);
1233 
1234   std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1235     auto It = InstructionCostDetailMap.find(I);
1236     if (It != InstructionCostDetailMap.end())
1237       return It->second;
1238     return std::nullopt;
1239   }
1240 
1241   virtual ~InlineCostCallAnalyzer() = default;
1242   int getThreshold() const { return Threshold; }
1243   int getCost() const { return Cost; }
1244   int getStaticBonusApplied() const { return StaticBonusApplied; }
1245   std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1246   bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1247   bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1248 };
1249 
1250 // Return true if CB is the sole call to local function Callee.
1251 static bool isSoleCallToLocalFunction(const CallBase &CB,
1252                                       const Function &Callee) {
1253   return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1254          &Callee == CB.getCalledFunction();
1255 }
1256 
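// Illustrative example (editor's sketch, not from the original source): this
// predicate holds for a direct call to an internal function with exactly one
// live use, e.g.
//
//   static int helper(int X) { return X + 1; }  // local linkage, one use
//   int caller(int X) { return helper(X); }     // the sole call site
//
// Inlining such a call allows the callee to be deleted afterwards, which is
// why updateThreshold() subtracts LastCallToStaticBonus from the cost for it.
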
1257 class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1258 private:
1259   InlineCostFeatures Cost = {};
1260 
1261   // FIXME: These constants are taken from the heuristic-based cost visitor.
1262   // These should be removed entirely in a later revision to avoid reliance on
1263   // heuristics in the ML inliner.
1264   static constexpr int JTCostMultiplier = 2;
1265   static constexpr int CaseClusterCostMultiplier = 2;
1266   static constexpr int SwitchDefaultDestCostMultiplier = 2;
1267   static constexpr int SwitchCostMultiplier = 2;
1268 
1269   // FIXME: These are taken from the heuristic-based cost visitor: we should
1270   // eventually abstract these to the CallAnalyzer to avoid duplication.
1271   unsigned SROACostSavingOpportunities = 0;
1272   int VectorBonus = 0;
1273   int SingleBBBonus = 0;
1274   int Threshold = 5;
1275 
1276   DenseMap<AllocaInst *, unsigned> SROACosts;
1277 
1278   void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1279     Cost[static_cast<size_t>(Feature)] += Delta;
1280   }
1281 
1282   void set(InlineCostFeatureIndex Feature, int64_t Value) {
1283     Cost[static_cast<size_t>(Feature)] = Value;
1284   }
1285 
1286   void onDisableSROA(AllocaInst *Arg) override {
1287     auto CostIt = SROACosts.find(Arg);
1288     if (CostIt == SROACosts.end())
1289       return;
1290 
1291     increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1292     SROACostSavingOpportunities -= CostIt->second;
1293     SROACosts.erase(CostIt);
1294   }
1295 
1296   void onDisableLoadElimination() override {
1297     set(InlineCostFeatureIndex::load_elimination, 1);
1298   }
1299 
1300   void onCallPenalty() override {
1301     increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1302   }
1303 
1304   void onCallArgumentSetup(const CallBase &Call) override {
1305     increment(InlineCostFeatureIndex::call_argument_setup,
1306               Call.arg_size() * InstrCost);
1307   }
1308 
1309   void onLoadRelativeIntrinsic() override {
1310     increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1311   }
1312 
1313   void onLoweredCall(Function *F, CallBase &Call,
1314                      bool IsIndirectCall) override {
1315     increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1316               Call.arg_size() * InstrCost);
1317 
1318     if (IsIndirectCall) {
1319       InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1320                                          /*HintThreshold*/ {},
1321                                          /*ColdThreshold*/ {},
1322                                          /*OptSizeThreshold*/ {},
1323                                          /*OptMinSizeThreshold*/ {},
1324                                          /*HotCallSiteThreshold*/ {},
1325                                          /*LocallyHotCallSiteThreshold*/ {},
1326                                          /*ColdCallSiteThreshold*/ {},
1327                                          /*ComputeFullInlineCost*/ true,
1328                                          /*EnableDeferral*/ true};
1329       IndirectCallParams.DefaultThreshold =
1330           InlineConstants::IndirectCallThreshold;
1331 
1332       InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1333                                 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
1334                                 false, true);
1335       if (CA.analyze().isSuccess()) {
1336         increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1337                   CA.getCost());
1338         increment(InlineCostFeatureIndex::nested_inlines, 1);
1339       }
1340     } else {
1341       onCallPenalty();
1342     }
1343   }
1344 
1345   void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1346                         bool DefaultDestUnreachable) override {
1347     if (JumpTableSize) {
1348       if (!DefaultDestUnreachable)
1349         increment(InlineCostFeatureIndex::switch_default_dest_penalty,
1350                   SwitchDefaultDestCostMultiplier * InstrCost);
1351       int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1352                        JTCostMultiplier * InstrCost;
1353       increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1354       return;
1355     }
1356 
1357     if (NumCaseCluster <= 3) {
1358       increment(InlineCostFeatureIndex::case_cluster_penalty,
1359                 (NumCaseCluster - DefaultDestUnreachable) *
1360                     CaseClusterCostMultiplier * InstrCost);
1361       return;
1362     }
1363 
1364     int64_t ExpectedNumberOfCompare =
1365         getExpectedNumberOfCompare(NumCaseCluster);
1366 
1367     int64_t SwitchCost =
1368         ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1369     increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1370   }
1371 
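  // Illustrative arithmetic (editor's sketch, not from the original source;
  // assumes the default InstrCost of 5): a switch lowered to a jump table
  // with JumpTableSize == 16 and a reachable default destination is charged
  //   switch_default_dest_penalty = 2 * 5          = 10
  //   jump_table_penalty          = 16 * 5 + 2 * 5 = 90
  // while a small switch with NumCaseCluster == 3 and an unreachable default
  // is charged (3 - 1) * 2 * 5 = 20 into case_cluster_penalty.
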
1372   void onMissedSimplification() override {
1373     increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1374               InstrCost);
1375   }
1376 
1377   void onInitializeSROAArg(AllocaInst *Arg) override {
1378     auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1379     SROACosts[Arg] = SROAArgCost;
1380     SROACostSavingOpportunities += SROAArgCost;
1381   }
1382 
1383   void onAggregateSROAUse(AllocaInst *Arg) override {
1384     SROACosts.find(Arg)->second += InstrCost;
1385     SROACostSavingOpportunities += InstrCost;
1386   }
1387 
1388   void onBlockAnalyzed(const BasicBlock *BB) override {
1389     if (BB->getTerminator()->getNumSuccessors() > 1)
1390       set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1391     Threshold -= SingleBBBonus;
1392   }
1393 
1394   InlineResult finalizeAnalysis() override {
1395     auto *Caller = CandidateCall.getFunction();
1396     if (Caller->hasMinSize()) {
1397       DominatorTree DT(F);
1398       LoopInfo LI(DT);
1399       for (Loop *L : LI) {
1400         // Ignore loops that will not be executed
1401         if (DeadBlocks.count(L->getHeader()))
1402           continue;
1403         increment(InlineCostFeatureIndex::num_loops,
1404                   InlineConstants::LoopPenalty);
1405       }
1406     }
1407     set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1408     set(InlineCostFeatureIndex::simplified_instructions,
1409         NumInstructionsSimplified);
1410     set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1411     set(InlineCostFeatureIndex::constant_offset_ptr_args,
1412         NumConstantOffsetPtrArgs);
1413     set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1414 
1415     if (NumVectorInstructions <= NumInstructions / 10)
1416       Threshold -= VectorBonus;
1417     else if (NumVectorInstructions <= NumInstructions / 2)
1418       Threshold -= VectorBonus / 2;
1419 
1420     set(InlineCostFeatureIndex::threshold, Threshold);
1421 
1422     return InlineResult::success();
1423   }
1424 
1425   bool shouldStop() override { return false; }
1426 
1427   void onLoadEliminationOpportunity() override {
1428     increment(InlineCostFeatureIndex::load_elimination, 1);
1429   }
1430 
1431   InlineResult onAnalysisStart() override {
1432     increment(InlineCostFeatureIndex::callsite_cost,
1433               -1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1434 
1435     set(InlineCostFeatureIndex::cold_cc_penalty,
1436         (F.getCallingConv() == CallingConv::Cold));
1437 
1438     set(InlineCostFeatureIndex::last_call_to_static_bonus,
1439         isSoleCallToLocalFunction(CandidateCall, F));
1440 
1441     // FIXME: we shouldn't repeat this logic in both the Features and Cost
1442     // analyzer - instead, we should abstract it to a common method in the
1443     // CallAnalyzer
1444     int SingleBBBonusPercent = 50;
1445     int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1446     Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1447     Threshold *= TTI.getInliningThresholdMultiplier();
1448     SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1449     VectorBonus = Threshold * VectorBonusPercent / 100;
1450     Threshold += (SingleBBBonus + VectorBonus);
1451 
1452     return InlineResult::success();
1453   }
1454 
1455 public:
1456   InlineCostFeaturesAnalyzer(
1457       const TargetTransformInfo &TTI,
1458       function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1459       function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1460       function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
1461       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1462       CallBase &Call)
1463       : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
1464                      PSI) {}
1465 
1466   const InlineCostFeatures &features() const { return Cost; }
1467 };
1468 
1469 } // namespace
1470 
1471 /// Test whether the given value is an Alloca-derived function argument.
1472 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1473   return SROAArgValues.count(V);
1474 }
1475 
1476 void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1477   onDisableSROA(SROAArg);
1478   EnabledSROAAllocas.erase(SROAArg);
1479   disableLoadElimination();
1480 }
1481 
1482 void InlineCostAnnotationWriter::emitInstructionAnnot(
1483     const Instruction *I, formatted_raw_ostream &OS) {
1484   // The cost of inlining the given instruction is always printed.
1485   // The threshold delta is printed only when it is non-zero, which happens
1486   // when we decided to give a bonus at a particular instruction.
1487   std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1488   if (!Record)
1489     OS << "; No analysis for the instruction";
1490   else {
1491     OS << "; cost before = " << Record->CostBefore
1492        << ", cost after = " << Record->CostAfter
1493        << ", threshold before = " << Record->ThresholdBefore
1494        << ", threshold after = " << Record->ThresholdAfter << ", ";
1495     OS << "cost delta = " << Record->getCostDelta();
1496     if (Record->hasThresholdChanged())
1497       OS << ", threshold delta = " << Record->getThresholdDelta();
1498   }
1499   auto *V = ICCA->getSimplifiedValueUnchecked(const_cast<Instruction *>(I));
1500   if (V) {
1501     OS << ", simplified to ";
1502     V->print(OS, true);
1503     if (auto *VI = dyn_cast<Instruction>(V)) {
1504       if (VI->getFunction() != I->getFunction())
1505         OS << " (caller instruction)";
1506     } else if (auto *VArg = dyn_cast<Argument>(V)) {
1507       if (VArg->getParent() != I->getFunction())
1508         OS << " (caller argument)";
1509     }
1510   }
1511   OS << "\n";
1512 }
1513 
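// Illustrative output (editor's sketch, not from the original source): the
// annotation emitted above for a single instruction looks roughly like
//
//   ; cost before = 0, cost after = 5, threshold before = 674,
//   threshold after = 674, cost delta = 5
//
// with ", threshold delta = N" appended only when a bonus changed the
// threshold, and ", simplified to ..." appended when the instruction folded.
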
1514 /// If 'V' maps to a SROA candidate, disable SROA for it.
1515 void CallAnalyzer::disableSROA(Value *V) {
1516   if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1517     disableSROAForArg(SROAArg);
1518   }
1519 }
1520 
1521 void CallAnalyzer::disableLoadElimination() {
1522   if (EnableLoadElimination) {
1523     onDisableLoadElimination();
1524     EnableLoadElimination = false;
1525   }
1526 }
1527 
1528 /// Accumulate a constant GEP offset into an APInt if possible.
1529 ///
1530 /// Returns false if unable to compute the offset for any reason. Respects any
1531 /// simplified values known during the analysis of this callsite.
1532 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1533   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1534   assert(IntPtrWidth == Offset.getBitWidth());
1535 
1536   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1537        GTI != GTE; ++GTI) {
1538     ConstantInt *OpC =
1539         getDirectOrSimplifiedValue<ConstantInt>(GTI.getOperand());
1540     if (!OpC)
1541       return false;
1542     if (OpC->isZero())
1543       continue;
1544 
1545     // Handle a struct index, which adds its field offset to the pointer.
1546     if (StructType *STy = GTI.getStructTypeOrNull()) {
1547       unsigned ElementIdx = OpC->getZExtValue();
1548       const StructLayout *SL = DL.getStructLayout(STy);
1549       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1550       continue;
1551     }
1552 
1553     APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1554     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1555   }
1556   return true;
1557 }
1558 
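// Illustrative example (editor's sketch, not from the original source;
// assumes a typical 64-bit DataLayout where %struct.S = type { i32, i64 }
// occupies 16 bytes with element 1 at byte offset 8): for
//
//   %p = getelementptr inbounds %struct.S, ptr %base, i64 1, i32 1
//
// the loop above accumulates Offset = 1 * 16 + 8 = 24, provided every index
// is a constant either directly or via SimplifiedValues.
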
1559 /// Use TTI to check whether a GEP is free.
1560 ///
1561 /// Respects any simplified values known during the analysis of this callsite.
1562 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1563   SmallVector<Value *, 4> Operands;
1564   Operands.push_back(GEP.getOperand(0));
1565   for (const Use &Op : GEP.indices())
1566     if (Constant *SimpleOp = getSimplifiedValue<Constant>(Op))
1567       Operands.push_back(SimpleOp);
1568     else
1569       Operands.push_back(Op);
1570   return TTI.getInstructionCost(&GEP, Operands,
1571                                 TargetTransformInfo::TCK_SizeAndLatency) ==
1572          TargetTransformInfo::TCC_Free;
1573 }
1574 
1575 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1576   disableSROA(I.getOperand(0));
1577 
1578   // Check whether inlining will turn a dynamic alloca into a static
1579   // alloca and handle that case.
1580   if (I.isArrayAllocation()) {
1581     Constant *Size = getSimplifiedValue<Constant>(I.getArraySize());
1582     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1583       // Sometimes a dynamic alloca could be converted into a static alloca
1584       // after this constant prop, and become a huge static alloca on an
1585       // unconditional CFG path. Avoid inlining if this is going to happen above
1586       // a threshold.
1587       // FIXME: If the threshold is removed or lowered too much, we could end up
1588       // being too pessimistic and prevent inlining non-problematic code. This
1589       // could result in unintended perf regressions. A better overall strategy
1590       // is needed to track stack usage during inlining.
1591       Type *Ty = I.getAllocatedType();
1592       AllocatedSize = SaturatingMultiplyAdd(
1593           AllocSize->getLimitedValue(),
1594           DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1595       if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1596         HasDynamicAlloca = true;
1597       return false;
1598     }
1599   }
1600 
1601   // Accumulate the allocated size.
1602   if (I.isStaticAlloca()) {
1603     Type *Ty = I.getAllocatedType();
1604     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1605                                   AllocatedSize);
1606   }
1607 
1608   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1609   // a variety of reasons, and so we would like to not inline them into
1610   // functions which don't currently have a dynamic alloca. This simply
1611   // disables inlining altogether in the presence of a dynamic alloca.
1612   if (!I.isStaticAlloca())
1613     HasDynamicAlloca = true;
1614 
1615   return false;
1616 }
1617 
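// Illustrative example (editor's sketch, not from the original source): if
// the callee contains
//
//   %buf = alloca i64, i32 %n
//
// and %n simplifies to the constant 4096 at this call site, the alloca is
// treated as effectively static and adds 4096 * 8 = 32768 bytes to
// AllocatedSize; once the running total exceeds
// InlineConstants::MaxSimplifiedDynamicAllocaToInline, HasDynamicAlloca is
// set and the candidate is rejected.
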
1618 bool CallAnalyzer::visitPHI(PHINode &I) {
1619   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1620   // though we don't want to propagate its bonuses. The idea is to disable
1621   // SROA if it *might* be used in an inappropriate manner.
1622 
1623   // Phi nodes are always zero-cost.
1624   // FIXME: Pointer sizes may differ between different address spaces, so do we
1625   // need to use correct address space in the call to getPointerSizeInBits here?
1626   // need to use the correct address space in the call to getPointerSizeInBits here?
1627   // see the ZeroOffset is used as a dummy value, so we can probably use any
1628   // bit width for the ZeroOffset?
1629   APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1630   bool CheckSROA = I.getType()->isPointerTy();
1631 
1632   // Track the constant or pointer with constant offset we've seen so far.
1633   Constant *FirstC = nullptr;
1634   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1635   Value *FirstV = nullptr;
1636 
1637   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1638     BasicBlock *Pred = I.getIncomingBlock(i);
1639     // If the incoming block is dead, skip the incoming block.
1640     if (DeadBlocks.count(Pred))
1641       continue;
1642     // If the parent block of phi is not the known successor of the incoming
1643     // block, skip the incoming block.
1644     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1645     if (KnownSuccessor && KnownSuccessor != I.getParent())
1646       continue;
1647 
1648     Value *V = I.getIncomingValue(i);
1649     // If the incoming value is this phi itself, skip the incoming value.
1650     if (&I == V)
1651       continue;
1652 
1653     Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1654 
1655     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1656     if (!C && CheckSROA)
1657       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1658 
1659     if (!C && !BaseAndOffset.first)
1660       // The incoming value is neither a constant nor a pointer with constant
1661       // offset, exit early.
1662       return true;
1663 
1664     if (FirstC) {
1665       if (FirstC == C)
1666         // If we've seen a constant incoming value before and it is the same
1667         // constant we see this time, continue checking the next incoming value.
1668         continue;
1669       // Otherwise early exit because we either see a different constant or saw
1670       // a constant before but we have a pointer with constant offset this time.
1671       return true;
1672     }
1673 
1674     if (FirstV) {
1675       // The same logic as above, but check pointer with constant offset here.
1676       if (FirstBaseAndOffset == BaseAndOffset)
1677         continue;
1678       return true;
1679     }
1680 
1681     if (C) {
1682       // This is the 1st time we've seen a constant, record it.
1683       FirstC = C;
1684       continue;
1685     }
1686 
1687     // The remaining case is that this is the 1st time we've seen a pointer with
1688     // constant offset, record it.
1689     FirstV = V;
1690     FirstBaseAndOffset = BaseAndOffset;
1691   }
1692 
1693   // Check if we can map phi to a constant.
1694   if (FirstC) {
1695     SimplifiedValues[&I] = FirstC;
1696     return true;
1697   }
1698 
1699   // Check if we can map phi to a pointer with constant offset.
1700   if (FirstBaseAndOffset.first) {
1701     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1702 
1703     if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1704       SROAArgValues[&I] = SROAArg;
1705   }
1706 
1707   return true;
1708 }
1709 
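// Illustrative example (editor's sketch, not from the original source): a
// phi whose live incoming values all agree is folded by the logic above, e.g.
//
//   %x = phi i32 [ 7, %bb0 ], [ 7, %bb1 ]
//
// is recorded as the constant 7 in SimplifiedValues, while incoming pointers
// that share one (base, constant offset) pair are recorded in
// ConstantOffsetPtrs so SROA stays enabled for the underlying alloca.
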
1710 /// Check whether we can fold GEPs of constant-offset call site argument pointers.
1711 /// This requires target data and inbounds GEPs.
1712 ///
1713 /// \return true if the specified GEP can be folded.
1714 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1715   // Check if we have a base + offset for the pointer.
1716   std::pair<Value *, APInt> BaseAndOffset =
1717       ConstantOffsetPtrs.lookup(I.getPointerOperand());
1718   if (!BaseAndOffset.first)
1719     return false;
1720 
1721   // Check if the offset of this GEP is constant, and if so accumulate it
1722   // into Offset.
1723   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1724     return false;
1725 
1726   // Add the result as a new mapping to Base + Offset.
1727   ConstantOffsetPtrs[&I] = BaseAndOffset;
1728 
1729   return true;
1730 }
1731 
1732 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1733   auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1734 
1735   // Lambda to check whether a GEP's indices are all constant.
1736   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1737     for (const Use &Op : GEP.indices())
1738       if (!getDirectOrSimplifiedValue<Constant>(Op))
1739         return false;
1740     return true;
1741   };
1742 
1743   if (!DisableGEPConstOperand)
1744     if (simplifyInstruction(I))
1745       return true;
1746 
1747   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1748     if (SROAArg)
1749       SROAArgValues[&I] = SROAArg;
1750 
1751     // Constant GEPs are modeled as free.
1752     return true;
1753   }
1754 
1755   // Variable GEPs will require math and will disable SROA.
1756   if (SROAArg)
1757     disableSROAForArg(SROAArg);
1758   return isGEPFree(I);
1759 }
1760 
1761 // Simplify \p Cmp if RHS is const and we can ValueTrack LHS.
1762 // This handles only the case where the Cmp instruction guards a recursive
1763 // call and the Cmp is known to fail/succeed for the recursive call.
1764 bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
1765   // Bail out if LHS is not a function argument or RHS is NOT const:
1766   if (!isa<Argument>(Cmp.getOperand(0)) || !isa<Constant>(Cmp.getOperand(1)))
1767     return false;
1768   auto *CmpOp = Cmp.getOperand(0);
1769   // Make sure that the callsite is recursive:
1770   if (CandidateCall.getCaller() != &F)
1771     return false;
1772   // Only handle the case when the callsite has a single predecessor:
1773   auto *CallBB = CandidateCall.getParent();
1774   auto *Predecessor = CallBB->getSinglePredecessor();
1775   if (!Predecessor)
1776     return false;
1777   // Check if the callsite is guarded by the same Cmp instruction:
1778   auto *Br = dyn_cast<BranchInst>(Predecessor->getTerminator());
1779   if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp)
1780     return false;
1781 
1782   // Check whether any argument of the recursive callsite affects the cmp
1783   // instr:
1784   bool ArgFound = false;
1785   Value *FuncArg = nullptr, *CallArg = nullptr;
1786   for (unsigned ArgNum = 0;
1787        ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1788     FuncArg = F.getArg(ArgNum);
1789     CallArg = CandidateCall.getArgOperand(ArgNum);
1790     if (FuncArg == CmpOp && CallArg != CmpOp) {
1791       ArgFound = true;
1792       break;
1793     }
1794   }
1795   if (!ArgFound)
1796     return false;
1797 
1798   // Now we have a recursive call that is guarded by a cmp instruction.
1799   // Check if this cmp can be simplified:
1800   SimplifyQuery SQ(DL, dyn_cast<Instruction>(CallArg));
1801   CondContext CC(&Cmp);
1802   CC.Invert = (CallBB != Br->getSuccessor(0));
1803   SQ.CC = &CC;
1804   CC.AffectedValues.insert(FuncArg);
1805   Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1806       cast<CmpInst>(&Cmp), {CallArg, Cmp.getOperand(1)}, SQ);
1807   if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(SimplifiedInstruction)) {
1808     // Make sure that the BB of the recursive call is NOT the true successor
1809     // of the icmp. In other words, make sure that the recursion depth is 1.
1810     if ((ConstVal->isOne() && CC.Invert) ||
1811         (ConstVal->isZero() && !CC.Invert)) {
1812       SimplifiedValues[&Cmp] = ConstVal;
1813       return true;
1814     }
1815   }
1816   return false;
1817 }
1818 
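// Illustrative example (editor's sketch, not from the original source): in a
// callee like
//
//   void f(int n) {
//     if (n != 0)
//       f(0); // recursive call site guarded by the compare
//   }
//
// re-evaluating the guard with the call argument (0 != 0) folds to false, so
// the recursion is known to stop after one level and the compare is recorded
// as simplified for this call site.
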
1819 /// Simplify \p I if its operands are constants and update SimplifiedValues.
1820 bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1821   SmallVector<Constant *> COps;
1822   for (Value *Op : I.operands()) {
1823     Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
1824     if (!COp)
1825       return false;
1826     COps.push_back(COp);
1827   }
1828   auto *C = ConstantFoldInstOperands(&I, COps, DL);
1829   if (!C)
1830     return false;
1831   SimplifiedValues[&I] = C;
1832   return true;
1833 }
1834 
1835 /// Try to simplify a call to llvm.is.constant.
1836 ///
1837 /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1838 /// we expect calls of this specific intrinsic to be infrequent.
1839 ///
1840 /// FIXME: Given that we know CB's parent (F) caller
1841 /// (CandidateCall->getParent()->getParent()), we might be able to determine
1842 /// whether inlining F into F's caller would change how the call to
1843 /// llvm.is.constant would evaluate.
1844 bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1845   Value *Arg = CB.getArgOperand(0);
1846   auto *C = getDirectOrSimplifiedValue<Constant>(Arg);
1847 
1848   Type *RT = CB.getFunctionType()->getReturnType();
1849   SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1850   return true;
1851 }
1852 
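// Illustrative example (editor's sketch, not from the original source): a
// callee containing
//
//   %r = call i1 @llvm.is.constant.i32(i32 %x)
//
// folds %r to true here when %x is (or simplifies to) a constant at this
// call site, and to false otherwise, which mirrors how __builtin_constant_p
// would evaluate once the body is inlined.
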
1853 bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1854   // As per the langref, "The fourth argument to llvm.objectsize determines if
1855   // the value should be evaluated at runtime."
1856   if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1857     return false;
1858 
1859   Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1860                                  /*MustSucceed=*/true);
1861   Constant *C = dyn_cast_or_null<Constant>(V);
1862   if (C)
1863     SimplifiedValues[&CB] = C;
1864   return C;
1865 }
1866 
1867 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1868   // Propagate constants through bitcasts.
1869   if (simplifyInstruction(I))
1870     return true;
1871 
1872   // Track base/offsets through casts
1873   std::pair<Value *, APInt> BaseAndOffset =
1874       ConstantOffsetPtrs.lookup(I.getOperand(0));
1875   // Casts don't change the offset, just wrap it up.
1876   if (BaseAndOffset.first)
1877     ConstantOffsetPtrs[&I] = BaseAndOffset;
1878 
1879   // Also look for SROA candidates here.
1880   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1881     SROAArgValues[&I] = SROAArg;
1882 
1883   // Bitcasts are always zero cost.
1884   return true;
1885 }
1886 
1887 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1888   // Propagate constants through ptrtoint.
1889   if (simplifyInstruction(I))
1890     return true;
1891 
1892   // Track base/offset pairs when converted to a plain integer provided the
1893   // integer is large enough to represent the pointer.
1894   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1895   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1896   if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1897     std::pair<Value *, APInt> BaseAndOffset =
1898         ConstantOffsetPtrs.lookup(I.getOperand(0));
1899     if (BaseAndOffset.first)
1900       ConstantOffsetPtrs[&I] = BaseAndOffset;
1901   }
1902 
1903   // This is really weird. Technically, ptrtoint will disable SROA. However,
1904   // unless that ptrtoint is *used* somewhere in the live basic blocks after
1905   // inlining, it will be nuked, and SROA should proceed. All of the uses which
1906   // would block SROA would also block SROA if applied directly to a pointer,
1907   // and so we can just add the integer in here. The only places where SROA is
1908   // preserved either cannot fire on an integer, or won't in-and-of themselves
1909   // disable SROA (ext) w/o some later use that we would see and disable.
1910   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1911     SROAArgValues[&I] = SROAArg;
1912 
1913   return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1914          TargetTransformInfo::TCC_Free;
1915 }
1916 
1917 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1918   // Propagate constants through inttoptr.
1919   if (simplifyInstruction(I))
1920     return true;
1921 
1922   // Track base/offset pairs when round-tripped through a pointer without
1923   // modifications provided the integer is not too large.
1924   Value *Op = I.getOperand(0);
1925   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1926   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1927     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1928     if (BaseAndOffset.first)
1929       ConstantOffsetPtrs[&I] = BaseAndOffset;
1930   }
1931 
1932   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1933   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1934     SROAArgValues[&I] = SROAArg;
1935 
1936   return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1937          TargetTransformInfo::TCC_Free;
1938 }
1939 
1940 bool CallAnalyzer::visitCastInst(CastInst &I) {
1941   // Propagate constants through casts.
1942   if (simplifyInstruction(I))
1943     return true;
1944 
1945   // Disable SROA in the face of arbitrary casts we don't explicitly list
1946   // elsewhere.
1947   disableSROA(I.getOperand(0));
1948 
1949   // If this is a floating-point cast, and the target says this operation
1950   // is expensive, this may eventually become a library call. Treat the cost
1951   // as such.
1952   switch (I.getOpcode()) {
1953   case Instruction::FPTrunc:
1954   case Instruction::FPExt:
1955   case Instruction::UIToFP:
1956   case Instruction::SIToFP:
1957   case Instruction::FPToUI:
1958   case Instruction::FPToSI:
1959     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1960       onCallPenalty();
1961     break;
1962   default:
1963     break;
1964   }
1965 
1966   return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1967          TargetTransformInfo::TCC_Free;
1968 }
1969 
1970 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1971   return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1972 }
1973 
1974 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1975   // Does the *call site* have the NonNull attribute set on an argument?  We
1976   // use the attribute on the call site to memoize any analysis done in the
1977   // caller. This will also trip if the callee function has a non-null
1978   // parameter attribute, but that's a less interesting case because hopefully
1979   // the callee would already have been simplified based on that.
1980   if (Argument *A = dyn_cast<Argument>(V))
1981     if (paramHasAttr(A, Attribute::NonNull))
1982       return true;
1983 
1984   // Is this an alloca in the caller?  This is distinct from the attribute case
1985   // above because attributes aren't updated within the inliner itself and we
1986   // always want to catch the alloca derived case.
1987   if (isAllocaDerivedArg(V))
1988     // We can actually predict the result of comparisons between an
1989     // alloca-derived value and null. Note that this fires regardless of
1990     // SROA firing.
1991     return true;
1992 
1993   return false;
1994 }
1995 
1996 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1997   // If the normal destination of the invoke or the parent block of the call
1998   // site is unreachable-terminated, there is little point in inlining this
1999   // unless there is literally zero cost.
2000   // FIXME: Note that it is possible that an unreachable-terminated block has a
2001   // hot entry. For example, in the scenario below, inlining hot_call_X() may
2002   // be beneficial:
2003   // main() {
2004   //   hot_call_1();
2005   //   ...
2006   //   hot_call_N()
2007   //   exit(0);
2008   // }
2009   // For now, we are not handling this corner case here as it is rare in real
2010   // code. In the future, we should elaborate this based on BPI and BFI in more
2011   // general threshold adjusting heuristics in updateThreshold().
2012   if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
2013     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
2014       return false;
2015   } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
2016     return false;
2017 
2018   return true;
2019 }
2020 
2021 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
2022                                             BlockFrequencyInfo *CallerBFI) {
2023   // If global profile summary is available, then callsite's coldness is
2024   // determined based on that.
2025   if (PSI && PSI->hasProfileSummary())
2026     return PSI->isColdCallSite(Call, CallerBFI);
2027 
2028   // Otherwise we need BFI to be available.
2029   if (!CallerBFI)
2030     return false;
2031 
2032   // Determine if the callsite is cold relative to caller's entry. We could
2033   // potentially cache the computation of scaled entry frequency, but the added
2034   // complexity is not worth it unless this scaling shows up high in the
2035   // profiles.
2036   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
2037   auto CallSiteBB = Call.getParent();
2038   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
2039   auto CallerEntryFreq =
2040       CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
2041   return CallSiteFreq < CallerEntryFreq * ColdProb;
2042 }
2043 
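// Illustrative arithmetic (editor's sketch, not from the original source;
// assumes the default ColdCallSiteRelFreq of 2): without a profile summary,
// a call site is considered cold when its block frequency is below 2% of the
// caller's entry frequency, e.g. an entry frequency of 1000 makes any call
// site with frequency < 20 cold; updateThreshold() then drops all bonuses
// and lowers the threshold to at most Params.ColdCallSiteThreshold.
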
2044 std::optional<int>
2045 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
2046                                                 BlockFrequencyInfo *CallerBFI) {
2047 
2048   // If global profile summary is available, then callsite's hotness is
2049   // determined based on that.
2050   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
2051     return Params.HotCallSiteThreshold;
2052 
2053   // Otherwise we need BFI to be available and to have a locally hot callsite
2054   // threshold.
2055   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2056     return std::nullopt;
2057 
2058   // Determine if the callsite is hot relative to caller's entry. We could
2059   // potentially cache the computation of scaled entry frequency, but the added
2060   // complexity is not worth it unless this scaling shows up high in the
2061   // profiles.
2062   const BasicBlock *CallSiteBB = Call.getParent();
2063   BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
2064   BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2065   std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
2066   if (Limit && CallSiteFreq >= *Limit)
2067     return Params.LocallyHotCallSiteThreshold;
2068 
2069   // Otherwise treat it normally.
2070   return std::nullopt;
2071 }
2072 
2073 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2074   // If no size growth is allowed for this inlining, set Threshold to 0.
2075   if (!allowSizeGrowth(Call)) {
2076     Threshold = 0;
2077     return;
2078   }
2079 
2080   Function *Caller = Call.getCaller();
2081 
2082   // return min(A, B) if B is valid.
2083   auto MinIfValid = [](int A, std::optional<int> B) {
2084     return B ? std::min(A, *B) : A;
2085   };
2086 
2087   // return max(A, B) if B is valid.
2088   auto MaxIfValid = [](int A, std::optional<int> B) {
2089     return B ? std::max(A, *B) : A;
2090   };
2091 
2092   // Various bonus percentages. These are multiplied by Threshold to get the
2093   // bonus values.
2094   // SingleBBBonus: This bonus is applied if the callee has a single reachable
2095   // basic block at the given callsite context. This is speculatively applied
2096   // and withdrawn if more than one basic block is seen.
2097   //
2098   // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
2099   // of the last call to a static function as inlining such functions is
2100   // guaranteed to reduce code size.
2101   //
2102   // These bonus percentages may be set to 0 based on properties of the caller
2103   // and the callsite.
2104   int SingleBBBonusPercent = 50;
2105   int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2106   int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2107 
2108   // Lambda to set all the above bonus and bonus percentages to 0.
2109   auto DisallowAllBonuses = [&]() {
2110     SingleBBBonusPercent = 0;
2111     VectorBonusPercent = 0;
2112     LastCallToStaticBonus = 0;
2113   };
2114 
2115   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2116   // and reduce the threshold if the caller has the necessary attribute.
2117   if (Caller->hasMinSize()) {
2118     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2119     // For minsize, we want to disable the single BB bonus and the vector
2120     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2121     // a static function will, at the minimum, eliminate the parameter setup and
2122     // call/return instructions.
2123     SingleBBBonusPercent = 0;
2124     VectorBonusPercent = 0;
2125   } else if (Caller->hasOptSize())
2126     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2127 
2128   // Adjust the threshold based on inlinehint attribute and profile based
2129   // hotness information if the caller does not have MinSize attribute.
2130   if (!Caller->hasMinSize()) {
2131     if (Callee.hasFnAttribute(Attribute::InlineHint))
2132       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2133 
2134     // FIXME: After switching to the new passmanager, simplify the logic below
2135     // by checking only the callsite hotness/coldness as we will reliably
2136     // have local profile information.
2137     //
2138     // Callsite hotness and coldness can be determined if sample profile is
2139     // used (which adds hotness metadata to calls) or if caller's
2140     // BlockFrequencyInfo is available.
2141     BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2142     auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2143     if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2144       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2145       // FIXME: This should update the threshold only if it exceeds the
2146       // current threshold, but AutoFDO + ThinLTO currently relies on this
2147       // behavior to prevent inlining of hot callsites during ThinLTO
2148       // compile phase.
2149       Threshold = *HotCallSiteThreshold;
2150     } else if (isColdCallSite(Call, CallerBFI)) {
2151       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2152       // Do not apply bonuses for a cold callsite including the
2153       // LastCallToStatic bonus. While this bonus might result in code size
2154       // reduction, it can cause the size of a non-cold caller to increase
2155       // preventing it from being inlined.
2156       DisallowAllBonuses();
2157       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2158     } else if (PSI) {
2159       // Use callee's global profile information only if we have no way of
2160       // determining this via callsite information.
2161       if (PSI->isFunctionEntryHot(&Callee)) {
2162         LLVM_DEBUG(dbgs() << "Hot callee.\n");
2163         // If callsite hotness cannot be determined, we may still know
2164         // that the callee is hot and treat it as a weaker hint for threshold
2165         // increase.
2166         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2167       } else if (PSI->isFunctionEntryCold(&Callee)) {
2168         LLVM_DEBUG(dbgs() << "Cold callee.\n");
2169         // Do not apply bonuses for a cold callee including the
2170         // LastCallToStatic bonus. While this bonus might result in code size
2171         // reduction, it can cause the size of a non-cold caller to increase
2172         // preventing it from being inlined.
2173         DisallowAllBonuses();
2174         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2175       }
2176     }
2177   }
2178 
2179   Threshold += TTI.adjustInliningThreshold(&Call);
2180 
2181   // Finally, take the target-specific inlining threshold multiplier into
2182   // account.
2183   Threshold *= TTI.getInliningThresholdMultiplier();
2184 
2185   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2186   VectorBonus = Threshold * VectorBonusPercent / 100;
2187 
2188   // If there is only one call of the function, and it has internal linkage,
2189   // the cost of inlining it drops dramatically. It may seem odd to update
2190   // Cost in updateThreshold, but the bonus depends on the logic in this method.
2191   if (isSoleCallToLocalFunction(Call, F)) {
2192     Cost -= LastCallToStaticBonus;
2193     StaticBonusApplied = LastCallToStaticBonus;
2194   }
2195 }
2196 
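// Illustrative arithmetic (editor's sketch, not from the original source;
// assumes the default threshold of 225, a threshold multiplier of 1, and a
// TTI vector bonus of 150%): updateThreshold() produces
//
//   SingleBBBonus = 225 *  50 / 100 = 112
//   VectorBonus   = 225 * 150 / 100 = 337
//
// and onAnalysisStart() then speculatively raises the working threshold to
// 225 + 112 + 337 = 674; the bonuses are withdrawn later if the callee turns
// out to have multiple blocks or few vector instructions.
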
2197 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2198   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2199   // First try to handle simplified comparisons.
2200   if (simplifyInstruction(I))
2201     return true;
2202 
2203   // Try to handle comparison that can be simplified using ValueTracking.
2204   if (simplifyCmpInstForRecCall(I))
2205     return true;
2206 
2207   if (I.getOpcode() == Instruction::FCmp)
2208     return false;
2209 
2210   // Otherwise look for a comparison between constant offset pointers with
2211   // a common base.
2212   Value *LHSBase, *RHSBase;
2213   APInt LHSOffset, RHSOffset;
2214   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2215   if (LHSBase) {
2216     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2217     if (RHSBase && LHSBase == RHSBase) {
2218       // We have common bases, fold the icmp to a constant based on the
2219       // offsets.
2220       SimplifiedValues[&I] = ConstantInt::getBool(
2221           I.getType(),
2222           ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
2223       ++NumConstantPtrCmps;
2224       return true;
2225     }
2226   }
2227 
2228   auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2229     for (auto *User : I.users())
2230       if (auto *Instr = dyn_cast<Instruction>(User))
2231         if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2232           return false;
2233     return true;
2234   };
2235 
2236   // If the comparison is an equality comparison with null, we can simplify it
2237   // if we know the value (argument) can't be null
2238   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2239     if (isKnownNonNullInCallee(I.getOperand(0))) {
2240       bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2241       SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2242                                         : ConstantInt::getFalse(I.getType());
2243       return true;
2244     }
2245     // Implicit null checks act as unconditional branches and their comparisons
2246     // should be treated as simplified and free of cost.
2247     if (isImplicitNullCheckCmp(I))
2248       return true;
2249   }
2250   return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2251 }
2252 
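// Illustrative example (editor's sketch, not from the original source): two
// pointers tracked as (base + constant offset) over the same base fold in
// the code above, e.g. with
//
//   %p = getelementptr inbounds i32, ptr %a, i64 1 ; offset 4
//   %q = getelementptr inbounds i32, ptr %a, i64 2 ; offset 8
//   %c = icmp ult ptr %p, %q
//
// the compare folds to true from the offsets alone; likewise an equality
// compare of an alloca-derived argument against null is decided via
// isKnownNonNullInCallee().
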
2253 bool CallAnalyzer::visitSub(BinaryOperator &I) {
2254   // Try to handle a special case: we can fold computing the difference of two
2255   // constant-related pointers.
2256   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2257   Value *LHSBase, *RHSBase;
2258   APInt LHSOffset, RHSOffset;
2259   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2260   if (LHSBase) {
2261     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2262     if (RHSBase && LHSBase == RHSBase) {
2263       // We have common bases, fold the subtract to a constant based on the
2264       // offsets.
2265       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2266       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2267       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2268         SimplifiedValues[&I] = C;
2269         ++NumConstantPtrDiffs;
2270         return true;
2271       }
2272     }
2273   }
2274 
2275   // Otherwise, fall back to the generic logic for simplifying and handling
2276   // instructions.
2277   return Base::visitSub(I);
2278 }
2279 
2280 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2281   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2282   Constant *CLHS = getDirectOrSimplifiedValue<Constant>(LHS);
2283   Constant *CRHS = getDirectOrSimplifiedValue<Constant>(RHS);
2284 
2285   Value *SimpleV = nullptr;
2286   if (auto FI = dyn_cast<FPMathOperator>(&I))
2287     SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2288                             FI->getFastMathFlags(), DL);
2289   else
2290     SimpleV =
2291         simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2292 
2293   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2294     SimplifiedValues[&I] = C;
2295 
2296   if (SimpleV)
2297     return true;
2298 
2299   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2300   disableSROA(LHS);
2301   disableSROA(RHS);
2302 
2303   // If the instruction is floating point, and the target says this operation
2304   // is expensive, this may eventually become a library call. Treat the cost
2305   // as such, unless it's fneg, which can be implemented with an xor.
2306   using namespace llvm::PatternMatch;
2307   if (I.getType()->isFloatingPointTy() &&
2308       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2309       !match(&I, m_FNeg(m_Value())))
2310     onCallPenalty();
2311 
2312   return false;
2313 }
2314 
2315 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2316   Value *Op = I.getOperand(0);
2317   Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
2318 
2319   Value *SimpleV = simplifyFNegInst(
2320       COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2321 
2322   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2323     SimplifiedValues[&I] = C;
2324 
2325   if (SimpleV)
2326     return true;
2327 
2328   // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2329   disableSROA(Op);
2330 
2331   return false;
2332 }
2333 
2334 bool CallAnalyzer::visitLoad(LoadInst &I) {
2335   if (handleSROA(I.getPointerOperand(), I.isSimple()))
2336     return true;
2337 
2338   // If the data is already loaded from this address and hasn't been clobbered
2339   // by any stores or calls, this load is likely to be redundant and can be
2340   // eliminated.
2341   if (EnableLoadElimination &&
2342       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2343     onLoadEliminationOpportunity();
2344     return true;
2345   }
2346 
2347   onMemAccess();
2348   return false;
2349 }
2350 
2351 bool CallAnalyzer::visitStore(StoreInst &I) {
2352   if (handleSROA(I.getPointerOperand(), I.isSimple()))
2353     return true;
2354 
2355   // The store can potentially clobber loads and prevent repeated loads from
2356   // being eliminated.
2357   // FIXME:
2358   // 1. We can probably keep an initial set of eliminatable loads subtracted
2359   // from the cost even when we finally see a store. We just need to disable
2360   // *further* accumulation of elimination savings.
2361   // 2. We should probably at some point thread MemorySSA for the callee into
2362   // this and then use that to actually compute *really* precise savings.
2363   disableLoadElimination();
2364 
2365   onMemAccess();
2366   return false;
2367 }
2368 
2369 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2370   Value *Op = I.getAggregateOperand();
2371 
2372   // Special handling, because we want to simplify extractvalue with a
2373   // potential insertvalue from the caller.
2374   if (Value *SimpleOp = getSimplifiedValueUnchecked(Op)) {
2375     SimplifyQuery SQ(DL);
2376     Value *SimpleV = simplifyExtractValueInst(SimpleOp, I.getIndices(), SQ);
2377     if (SimpleV) {
2378       SimplifiedValues[&I] = SimpleV;
2379       return true;
2380     }
2381   }
2382 
2383   // SROA can't look through these, but they may be free.
2384   return Base::visitExtractValue(I);
2385 }
2386 
2387 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2388   // Constant folding for insert value is trivial.
2389   if (simplifyInstruction(I))
2390     return true;
2391 
2392   // SROA can't look through these, but they may be free.
2393   return Base::visitInsertValue(I);
2394 }
2395 
2396 /// Try to simplify a call site.
2397 ///
2398 /// Takes a concrete function and callsite and tries to actually simplify it by
2399 /// analyzing the arguments and call itself with instsimplify. Returns true if
2400 /// it has simplified the callsite to some other entity (a constant), making it
2401 /// free.
2402 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2403   // FIXME: Using the instsimplify logic directly for this is inefficient
2404   // because we have to continually rebuild the argument list even when no
2405   // simplifications can be performed. Until that is fixed with remapping
2406   // inside of instsimplify, directly constant fold calls here.
2407   if (!canConstantFoldCallTo(&Call, F))
2408     return false;
2409 
2410   // Try to re-map the arguments to constants.
2411   SmallVector<Constant *, 4> ConstantArgs;
2412   ConstantArgs.reserve(Call.arg_size());
2413   for (Value *I : Call.args()) {
2414     Constant *C = getDirectOrSimplifiedValue<Constant>(I);
2415     if (!C)
2416       return false; // This argument doesn't map to a constant.
2417 
2418     ConstantArgs.push_back(C);
2419   }
2420   if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2421     SimplifiedValues[&Call] = C;
2422     return true;
2423   }
2424 
2425   return false;
2426 }
2427 
2428 bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2429   const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2430   LibFunc LF;
2431   if (!TLI || !TLI->getLibFunc(*F, LF) || !TLI->has(LF))
2432     return TTI.isLoweredToCall(F);
2433 
2434   switch (LF) {
2435   case LibFunc_memcpy_chk:
2436   case LibFunc_memmove_chk:
2437   case LibFunc_mempcpy_chk:
2438   case LibFunc_memset_chk: {
2439   // Calls to __memcpy_chk whose length is known to fit within the object
2440     // size will eventually be replaced by inline stores. Therefore, these
2441     // should not incur a call penalty. This is only really relevant on
2442     // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2443     // other platforms use memcpy intrinsics, which are already exempt from the
2444     // call penalty.
2445     auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(2));
2446     auto *ObjSizeOp =
2447         getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(3));
2448     if (LenOp && ObjSizeOp &&
2449         LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2450       return false;
2451     }
2452     break;
2453   }
2454   default:
2455     break;
2456   }
2457 
2458   return TTI.isLoweredToCall(F);
2459 }
2460 
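// Illustrative example (editor's sketch, not from the original source): for
// a fortified call with constant arguments such as
//
//   call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 16, i64 32)
//
// the length (16) is known to fit the object size (32), so the call is
// expected to become inline stores and is not treated as lowered to a real
// call (and thus incurs no call penalty).
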
2461 bool CallAnalyzer::visitCallBase(CallBase &Call) {
2462   if (!onCallBaseVisitStart(Call))
2463     return true;
2464 
2465   if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2466       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2467     // This aborts the entire analysis.
2468     ExposesReturnsTwice = true;
2469     return false;
2470   }
2471   if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2472     ContainsNoDuplicateCall = true;
2473 
2474   if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Call.getCalledOperand()))
2475     onInlineAsm(*InlineAsmOp);
2476 
2477   Function *F = Call.getCalledFunction();
2478   bool IsIndirectCall = !F;
2479   if (IsIndirectCall) {
2480     // Check if this happens to be an indirect function call to a known function
2481     // in this inline context. If not, we've done all we can.
2482     Value *Callee = Call.getCalledOperand();
2483     F = getSimplifiedValue<Function>(Callee);
2484     if (!F || F->getFunctionType() != Call.getFunctionType()) {
2485       onCallArgumentSetup(Call);
2486 
2487       if (!Call.onlyReadsMemory())
2488         disableLoadElimination();
2489       return Base::visitCallBase(Call);
2490     }
2491   }
2492 
2493   assert(F && "Expected a call to a known function");
2494 
2495   // When we have a concrete function, first try to simplify it directly.
2496   if (simplifyCallSite(F, Call))
2497     return true;
2498 
2499   // Next check if it is an intrinsic we know about.
2500   // FIXME: Lift this into part of the InstVisitor.
2501   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2502     switch (II->getIntrinsicID()) {
2503     default:
2504       if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2505         disableLoadElimination();
2506       return Base::visitCallBase(Call);
2507 
2508     case Intrinsic::load_relative:
2509       onLoadRelativeIntrinsic();
2510       return false;
2511 
2512     case Intrinsic::memset:
2513     case Intrinsic::memcpy:
2514     case Intrinsic::memmove:
2515       disableLoadElimination();
2516       // SROA can usually chew through these intrinsics, but they aren't free.
2517       return false;
2518     case Intrinsic::icall_branch_funnel:
2519     case Intrinsic::localescape:
2520       HasUninlineableIntrinsic = true;
2521       return false;
2522     case Intrinsic::vastart:
2523       InitsVargArgs = true;
2524       return false;
2525     case Intrinsic::launder_invariant_group:
2526     case Intrinsic::strip_invariant_group:
2527       if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2528         SROAArgValues[II] = SROAArg;
2529       return true;
2530     case Intrinsic::is_constant:
2531       return simplifyIntrinsicCallIsConstant(Call);
2532     case Intrinsic::objectsize:
2533       return simplifyIntrinsicCallObjectSize(Call);
2534     }
2535   }
2536 
2537   if (F == Call.getFunction()) {
2538     // This flag will fully abort the analysis, so don't bother with anything
2539     // else.
2540     IsRecursiveCall = true;
2541     if (!AllowRecursiveCall)
2542       return false;
2543   }
2544 
2545   if (isLoweredToCall(F, Call)) {
2546     onLoweredCall(F, Call, IsIndirectCall);
2547   }
2548 
2549   if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2550     disableLoadElimination();
2551   return Base::visitCallBase(Call);
2552 }
2553 
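// Illustrative example (not from the original source): when the caller binds
// a function-pointer argument to a known function, as in
//   call void @wrapper(ptr @callee)
// an indirect call through that argument inside @wrapper simplifies via
// getSimplifiedValue<Function>() to a direct call to @callee, so the direct
// call handling above applies instead of the generic indirect-call path.
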
bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() ||
         getDirectOrSimplifiedValue<ConstantInt>(BI.getCondition()) ||
         BI.getMetadata(LLVMContext::MD_make_implicit);
}

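// Illustrative example (not from the original source): if the caller passes
// a constant that makes %cond in
//   br i1 %cond, label %then, label %else
// simplify to a ConstantInt, the branch is treated as free because it will
// fold to an unconditional branch once the callee is inlined here.
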
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = getDirectOrSimplifiedValue<Constant>(TrueVal);
  Constant *FalseC = getDirectOrSimplifiedValue<Constant>(FalseVal);
  Constant *CondC = getSimplifiedValue<Constant>(SI.getCondition());

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle
    // cases such as vector selects.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

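// Illustrative example (not from the original source): if the condition of
//   %p = select i1 %c, ptr %a, ptr %b
// simplifies to i1 true at this call site, SelectedV is %a; any constant
// base-and-offset or SROA candidacy tracked for %a is propagated to %p
// above, so the select stays free and SROA-friendly.
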
bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (getDirectOrSimplifiedValue<ConstantInt>(SI.getCondition()))
    return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  unsigned JumpTableSize = 0;
  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);

  onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUnreachable());
  return false;
}

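// Illustrative note (not from the original source): a switch whose condition
// simplifies to a ConstantInt at this call site is free; otherwise TTI
// estimates whether it lowers to a jump table or a balanced binary tree of
// case clusters, and onFinalizeSwitch() charges a cost proportional to the
// jump table range or the tree size, as described above.
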
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr.  This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would refer to the original function, and this indirect
  // jump would jump from the inlined copy of the function into the original
  // function, which is undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions
  // leading to unreachable, as they have the lowest possible impact on both
  // runtime and code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
      TargetTransformInfo::TCC_Free)
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (const Use &Op : I.operands())
    disableSROA(Op);

  return false;
}

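// Illustrative note (not from the original source): an instruction that TTI
// reports as TargetTransformInfo::TCC_Free under the TCK_SizeAndLatency cost
// kind (a no-op cast, say) is skipped entirely; anything unrecognized has
// its base cost counted against inlining and disables SROA for any
// alloca-derived operands it touches.
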
/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           const SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently the number of instructions in a function, regardless
    // of our ability to simplify them to constants or dead code during
    // inlining, is used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(&I);

    if (Base::visit(&I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(&I);
    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function, we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
      auto IR =
          InlineResult::failure("recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}

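// Illustrative note (not from the original source): these checks fire per
// instruction, so a callee that calls @llvm.vastart sets InitsVargArgs in
// visitCallBase() and block analysis aborts immediately with the "varargs"
// failure rather than accumulating further cost.
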
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer,
/// and accumulates the total constant offset applied in the returned
/// constant. It returns nullptr if V is not a pointer, and returns the
/// constant '0' if there are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}

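// Illustrative example (not from the original source): for
//   %a = alloca [16 x i32]
//   %p = getelementptr inbounds [16 x i32], ptr %a, i64 0, i64 4
// passing %p strips the inbounds GEP, rewrites V to point at %a, and returns
// the accumulated byte offset 16 (four i32 elements) as a ConstantInt of the
// index type.
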
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors
/// have no live incoming CFG edges.  If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the
/// dead blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    if (DeadBlocks.count(Pred))
      return true;
    BasicBlock *KnownSucc = KnownSuccessors[Pred];
    return KnownSucc && KnownSucc != Succ;
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead).second)
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

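// Illustrative example (not from the original source): if the conditional
// branch terminating %entry is known to take its true edge to %then, then
// %else is dead unless it has another live predecessor, and any blocks
// reachable only through %else are marked dead transitively by the worklist
// above.
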
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    SimplifiedValues[&FAI] = *CAI;
    if (isa<Constant>(*CAI))
      ++NumConstantArgs;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // Collecting the ephemeral values of `F` can be expensive, so use the
  // ephemeral values cache if available.
  SmallPtrSet<const Value *, 32> EphValuesStorage;
  const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
  if (GetEphValuesCache)
    EphValues = &GetEphValuesCache(F).ephValues();
  else
    CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F),
                                        EphValuesStorage);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, we use a small-size optimized SetVector, favoring small
  // iteration counts because we exit early once we cross our threshold.
  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches.  But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    InlineResult IR = analyzeBlock(BB, *EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    BBWorklist.insert_range(successors(BB));

    onBlockAnalyzed(BB);
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  // If the callee's stack size exceeds the user-specified threshold,
  // do not let it be inlined.
  // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
            Caller, InlineConstants::MaxInlineStackSizeAttributeName))
      FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure("stacksize");

  return finalizeAnalysis();
}

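// Illustrative example (not from the original source): for a call site
//   call i32 @callee(i32 7, ptr %buf)
// the argument loop above seeds SimplifiedValues so that @callee's first
// parameter is the constant 7; branches and switches in @callee that test it
// then resolve to a single live successor, and blocks proven dead for this
// call site never enter BBWorklist or contribute cost.
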
void InlineCostCallAnalyzer::print(raw_ostream &OS) {
#define DEBUG_PRINT_STAT(x) OS << "      " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(NumInlineAsmInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
  // object, and always returns the same object (which is overwritten on each
  // GetTLI call). Therefore we copy the first result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
                          const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  return std::min<int64_t>(Cost, INT_MAX);
}

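// Illustrative example (not from the original source): a byval argument of
// type [32 x i8] on a target with 64-bit pointers gives TypeSize = 256 and
// PointerSize = 64, so NumStores = (256 + 63) / 64 = 4 and the argument
// contributes 2 * 4 * InstrCost, modeling one load and one store per word
// copied; a plain scalar argument contributes just InstrCost.
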
InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
    function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
                       GetEphValuesCache);
}

std::optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}

std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
                                 PSI, ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}

std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine function is inlined into a caller coroutine
  // function before the coro-split pass, the coro-early pass cannot handle
  // this well. So we won't inline a coroutine function that has not yet been
  // split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  // Don't inline functions that are loader replaceable.
  if (Callee->hasFnAttribute("loader-replaceable"))
    return InlineResult::failure("loader replaceable function attribute");

  return std::nullopt;
}

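// Illustrative note (not from the original source): a call site carrying the
// alwaysinline attribute on a viable callee short-circuits to success here,
// so getInlineCost() below reports InlineCost::getAlways() without running
// the CallAnalyzer; conversely, noinline on either the call site or the
// callee yields a definitive failure before any cost is computed.
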
InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
    function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {

  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                            /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
                            GetEphValuesCache);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make a cost-benefit-based decision explicit.
  // We use always/never here since the threshold is not meaningful,
  // as it's not what drives the cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the
          // current backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing this
          // correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}

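// Illustrative example (not from the original source): a callee whose body
// contains
//   indirectbr ptr %dest, [label %a, label %b]
// is rejected here with "contains indirect branches" regardless of any cost
// computation, even when the call site is marked alwaysinline.
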
// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  //  If the -inline-threshold flag is explicitly specified, that is used
  //  irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

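// Illustrative example (not from the original source): passing
// -inline-threshold=500 makes Params.DefaultThreshold 500 regardless of the
// Threshold argument above, and ColdThreshold is then honored only when
// -inlinecold-threshold is also given explicitly.
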
InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}

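// Illustrative mapping (not from the original source): OptLevel 3 (-O3)
// selects InlineConstants::OptAggressiveThreshold, SizeOptLevel 1 (-Os)
// selects OptSizeThreshold, SizeOptLevel 2 (-Oz) selects
// OptMinSizeThreshold, and everything else falls back to DefaultThreshold.
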
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}

PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);

  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
  // In the current implementation, the type of InlineParams doesn't matter as
  // the pass serves only for verification of inliner's decisions.
  // We can add a flag which determines InlineParams for this run. Right now,
  // the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *CalledFunction = CB->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
                                    GetAssumptionCache, nullptr, nullptr, PSI,
                                    &ORE);
        ICCA.analyze();
        OS << "      Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CB->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}