//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This pass exposes codegen information to IR-level passes. Every
/// transformation that uses codegen information is broken into three parts:
/// 1. The IR-level analysis pass.
/// 2. The IR-level transformation interface, which provides the needed
///    information.
/// 3. Codegen-level implementation which uses target-specific hooks.
///
/// This file defines #2, which is the interface that IR-level transformations
/// use for querying the codegen.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/InstructionCost.h"
#include <functional>
#include <optional>
#include <utility>

namespace llvm {

namespace Intrinsic {
typedef unsigned ID;
}

class AllocaInst;
class AssumptionCache;
class BlockFrequencyInfo;
class DominatorTree;
class BranchInst;
class Function;
class GlobalValue;
class InstCombiner;
class OptimizationRemarkEmitter;
class InterleavedAccessInfo;
class IntrinsicInst;
class LoadInst;
class Loop;
class LoopInfo;
class LoopVectorizationLegality;
class ProfileSummaryInfo;
class RecurrenceDescriptor;
class SCEV;
class ScalarEvolution;
class SmallBitVector;
class StoreInst;
class SwitchInst;
class TargetLibraryInfo;
class Type;
class VPIntrinsic;
struct KnownBits;

/// Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
  /// This is the pointer that the intrinsic is loading from or storing to.
  /// If this is non-null, then analysis/optimization passes can assume that
  /// this intrinsic is functionally equivalent to a load/store from this
  /// pointer.
  Value *PtrVal = nullptr;

  // Ordering for atomic operations.
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;

  // The same Id is set by the target for corresponding load/store intrinsics.
  unsigned short MatchingId = 0;

  bool ReadMem = false;
  bool WriteMem = false;
  bool IsVolatile = false;

  bool isUnordered() const {
    return (Ordering == AtomicOrdering::NotAtomic ||
            Ordering == AtomicOrdering::Unordered) &&
           !IsVolatile;
  }
};
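
// Illustrative usage sketch (not part of the interface proper): a pass that
// has obtained a MemIntrinsicInfo for a target intrinsic -- e.g. via the
// getTgtMemIntrinsic hook elsewhere in this interface -- might treat the
// intrinsic like an ordinary load. `II` and `handleAsLoad` are hypothetical.
//
//   MemIntrinsicInfo Info;
//   if (TTI.getTgtMemIntrinsic(II, Info) && Info.PtrVal &&
//       Info.isUnordered() && !Info.WriteMem)
//     handleAsLoad(II, Info.PtrVal); // hypothetical helper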

/// Attributes of a target dependent hardware loop.
struct HardwareLoopInfo {
  HardwareLoopInfo() = delete;
  LLVM_ABI HardwareLoopInfo(Loop *L);
  Loop *L = nullptr;
  BasicBlock *ExitBlock = nullptr;
  BranchInst *ExitBranch = nullptr;
  const SCEV *ExitCount = nullptr;
  IntegerType *CountType = nullptr;
  Value *LoopDecrement = nullptr; // Decrement the loop counter by this
                                  // value in every iteration.
  bool IsNestingLegal = false;    // Can a hardware loop be a parent to
                                  // another hardware loop?
  bool CounterInReg = false;      // Should loop counter be updated in
                                  // the loop via a phi?
  bool PerformEntryTest = false;  // Generate the intrinsic which also performs
                                  // icmp ne zero on the loop counter value and
                                  // produces an i1 to guard the loop entry.
  LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
                                        DominatorTree &DT,
                                        bool ForceNestedLoop = false,
                                        bool ForceHardwareLoopPHI = false);
  LLVM_ABI bool canAnalyze(LoopInfo &LI);
};

class IntrinsicCostAttributes {
  const IntrinsicInst *II = nullptr;
  Type *RetTy = nullptr;
  Intrinsic::ID IID;
  SmallVector<Type *, 4> ParamTys;
  SmallVector<const Value *, 4> Arguments;
  FastMathFlags FMF;
  // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
  // arguments and the return value will be computed based on types.
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  TargetLibraryInfo const *LibInfo = nullptr;

public:
  LLVM_ABI IntrinsicCostAttributes(
      Intrinsic::ID Id, const CallBase &CI,
      InstructionCost ScalarCost = InstructionCost::getInvalid(),
      bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr);

  LLVM_ABI IntrinsicCostAttributes(
      Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
      FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
      InstructionCost ScalarCost = InstructionCost::getInvalid());

  LLVM_ABI IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                                   ArrayRef<const Value *> Args);

  LLVM_ABI IntrinsicCostAttributes(
      Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
      ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
      const IntrinsicInst *I = nullptr,
      InstructionCost ScalarCost = InstructionCost::getInvalid(),
      TargetLibraryInfo const *LibInfo = nullptr);

  Intrinsic::ID getID() const { return IID; }
  const IntrinsicInst *getInst() const { return II; }
  Type *getReturnType() const { return RetTy; }
  FastMathFlags getFlags() const { return FMF; }
  InstructionCost getScalarizationCost() const { return ScalarizationCost; }
  const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
  const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
  const TargetLibraryInfo *getLibInfo() const { return LibInfo; }

  bool isTypeBasedOnly() const { return Arguments.empty(); }

  bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
};
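
// Illustrative sketch: a vectorizer-style client costing a call to
// llvm.fmuladd. The getIntrinsicInstrCost query is assumed to be declared
// further down in this interface; `VecTy` and `ScalarCost` are hypothetical.
//
//   SmallVector<Type *, 3> ArgTys = {VecTy, VecTy, VecTy};
//   IntrinsicCostAttributes Attrs(Intrinsic::fmuladd, VecTy, ArgTys);
//   InstructionCost Cost =
//       TTI.getIntrinsicInstrCost(Attrs, TTI::TCK_RecipThroughput);
//   if (Cost.isValid() && Cost <= ScalarCost)
//     ...; // prefer the vector intrinsic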

enum class TailFoldingStyle {
  /// Don't use tail folding.
  None,
  /// Use predicate only to mask operations on data in the loop.
  /// When the VL is not known to be a power-of-2, this method requires a
  /// runtime overflow check for the i + VL in the loop because it compares
  /// the scalar induction variable against the tripcount rounded up by VL
  /// which may overflow. When the VL is a power-of-2, both the increment and
  /// uprounded tripcount will overflow to 0, which does not require a runtime
  /// check since the loop is exited when the loop induction variable equals
  /// the uprounded trip-count, which are both 0.
  Data,
  /// Same as Data, but avoids using the get.active.lane.mask intrinsic to
  /// calculate the mask and instead implements this with a
  /// splat/stepvector/cmp.
  /// FIXME: Can this kind be removed now that SelectionDAGBuilder expands the
  /// active.lane.mask intrinsic when it is not natively supported?
  DataWithoutLaneMask,
  /// Use predicate to control both data and control flow.
  /// This method always requires a runtime overflow check for the i + VL
  /// increment inside the loop, because it uses the result directly in the
  /// active.lane.mask to calculate the mask for the next iteration. If the
  /// increment overflows, the mask is no longer correct.
  DataAndControlFlow,
  /// Use predicate to control both data and control flow, but modify
  /// the trip count so that a runtime overflow check can be avoided
  /// and such that the scalar epilogue loop can always be removed.
  DataAndControlFlowWithoutRuntimeCheck,
  /// Use predicated EVL instructions for tail-folding.
  /// Indicates that VP intrinsics should be used.
  DataWithEVL,
};

struct TailFoldingInfo {
  TargetLibraryInfo *TLI;
  LoopVectorizationLegality *LVL;
  InterleavedAccessInfo *IAI;
  TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
                  InterleavedAccessInfo *IAI)
      : TLI(TLI), LVL(LVL), IAI(IAI) {}
};

class TargetTransformInfo;
typedef TargetTransformInfo TTI;
class TargetTransformInfoImplBase;

/// This pass provides access to the codegen interfaces that are needed
/// for IR-level transformations.
class TargetTransformInfo {
public:
  enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };

  /// Get the kind of extension that an instruction represents.
  LLVM_ABI static PartialReductionExtendKind
  getPartialReductionExtendKind(Instruction *I);

  /// Construct a TTI object using a type implementing the \c Concept
  /// API below.
  ///
  /// This is used by targets to construct a TTI wrapping their target-specific
  /// implementation that encodes appropriate costs for their target.
  LLVM_ABI explicit TargetTransformInfo(
      std::unique_ptr<const TargetTransformInfoImplBase> Impl);

  /// Construct a baseline TTI object using a minimal implementation of
  /// the \c Concept API below.
  ///
  /// The TTI implementation will reflect the information in the DataLayout
  /// provided if non-null.
  LLVM_ABI explicit TargetTransformInfo(const DataLayout &DL);

  // Provide move semantics.
  LLVM_ABI TargetTransformInfo(TargetTransformInfo &&Arg);
  LLVM_ABI TargetTransformInfo &operator=(TargetTransformInfo &&RHS);

  // We need to define the destructor out-of-line to define our sub-classes
  // out-of-line.
  LLVM_ABI ~TargetTransformInfo();

  /// Handle the invalidation of this information.
  ///
  /// When used as a result of \c TargetIRAnalysis this method will be called
  /// when the function this was computed for changes. When it returns false,
  /// the information is preserved across those changes.
  bool invalidate(Function &, const PreservedAnalyses &,
                  FunctionAnalysisManager::Invalidator &) {
    // FIXME: We should probably in some way ensure that the subtarget
    // information for a function hasn't changed.
    return false;
  }

  /// \name Generic Target Information
  /// @{

  /// The kind of cost model.
  ///
  /// There are several different cost models that can be customized by the
  /// target. The normalization of each cost model may be target specific.
  /// e.g. TCK_SizeAndLatency should be comparable to target thresholds such as
  /// those derived from MCSchedModel::LoopMicroOpBufferSize etc.
  enum TargetCostKind {
    TCK_RecipThroughput, ///< Reciprocal throughput.
    TCK_Latency,         ///< The latency of the instruction.
    TCK_CodeSize,        ///< Instruction code size.
    TCK_SizeAndLatency   ///< The weighted sum of size and latency.
  };

  /// Underlying constants for 'cost' values in this interface.
  ///
  /// Many APIs in this interface return a cost. This enum defines the
  /// fundamental values that should be used to interpret (and produce) those
  /// costs. The costs are returned as an int rather than a member of this
  /// enumeration because it is expected that the cost of one IR instruction
  /// may have a multiplicative factor to it or otherwise won't fit directly
  /// into the enum. Moreover, it is common to sum or average costs, which
  /// works better as simple integral values. Thus this enum only provides
  /// constants. Also note that the returned costs are signed integers to make
  /// it natural to add, subtract, and test with zero (a common boundary
  /// condition). It is not expected that 2^32 is a realistic cost to be
  /// modeling at any point.
  ///
  /// Note that these costs should usually reflect the intersection of
  /// code-size cost and execution cost. A free instruction is typically one
  /// that folds into another instruction. For example, reg-to-reg moves can
  /// often be skipped by renaming the registers in the CPU, but they still
  /// are encoded and thus wouldn't be considered 'free' here.
  enum TargetCostConstants {
    TCC_Free = 0,     ///< Expected to fold away in lowering.
    TCC_Basic = 1,    ///< The cost of a typical 'add' instruction.
    TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
  };

  /// Estimate the cost of a GEP operation when lowered.
  ///
  /// \p PointeeType is the source element type of the GEP.
  /// \p Ptr is the base pointer operand.
  /// \p Operands is the list of indices following the base pointer.
  ///
  /// \p AccessType is a hint as to what type of memory might be accessed by
  /// users of the GEP. getGEPCost will use it to determine if the GEP can be
  /// folded into the addressing mode of a load/store. If AccessType is null,
  /// then the resulting target type based on PointeeType will be used as an
  /// approximation.
  LLVM_ABI InstructionCost
  getGEPCost(Type *PointeeType, const Value *Ptr,
             ArrayRef<const Value *> Operands, Type *AccessType = nullptr,
             TargetCostKind CostKind = TCK_SizeAndLatency) const;
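
  // Illustrative sketch: costing a GEP that a pass is considering
  // rematerializing. `GEP` is a hypothetical GetElementPtrInst *.
  //
  //   SmallVector<const Value *, 4> Indices(GEP->indices());
  //   InstructionCost Cost = TTI.getGEPCost(
  //       GEP->getSourceElementType(), GEP->getPointerOperand(), Indices,
  //       /*AccessType=*/nullptr, TTI::TCK_SizeAndLatency);
  //   if (Cost == TTI::TCC_Free)
  //     ...; // expected to fold into its users' addressing modes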

  /// Describe known properties for a set of pointers.
  struct PointersChainInfo {
    /// All the GEPs in a set have the same base address.
    unsigned IsSameBaseAddress : 1;
    /// These properties are only valid if SameBaseAddress is set.
    /// True if all pointers are separated by a unit stride.
    unsigned IsUnitStride : 1;
    /// True if the distance between any two neighbouring pointers is a known
    /// value.
    unsigned IsKnownStride : 1;
    unsigned Reserved : 29;

    bool isSameBase() const { return IsSameBaseAddress; }
    bool isUnitStride() const { return IsSameBaseAddress && IsUnitStride; }
    bool isKnownStride() const { return IsSameBaseAddress && IsKnownStride; }

    static PointersChainInfo getUnitStride() {
      return {/*IsSameBaseAddress=*/1, /*IsUnitStride=*/1,
              /*IsKnownStride=*/1, 0};
    }
    static PointersChainInfo getKnownStride() {
      return {/*IsSameBaseAddress=*/1, /*IsUnitStride=*/0,
              /*IsKnownStride=*/1, 0};
    }
    static PointersChainInfo getUnknownStride() {
      return {/*IsSameBaseAddress=*/1, /*IsUnitStride=*/0,
              /*IsKnownStride=*/0, 0};
    }
  };
  static_assert(sizeof(PointersChainInfo) == 4, "Was size increase justified?");

  /// Estimate the cost of a chain of pointers (typically pointer operands of a
  /// chain of loads or stores within the same block) when lowered.
  /// \p AccessTy is the type of the loads/stores that will ultimately use the
  /// \p Ptrs.
  LLVM_ABI InstructionCost getPointersChainCost(
      ArrayRef<const Value *> Ptrs, const Value *Base,
      const PointersChainInfo &Info, Type *AccessTy,
      TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

  /// \returns A value by which our inlining threshold should be multiplied.
  /// This is primarily used to bump up the inlining threshold wholesale on
  /// targets where calls are unusually expensive.
  ///
  /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
  /// individual classes of instructions would be better.
  LLVM_ABI unsigned getInliningThresholdMultiplier() const;

  LLVM_ABI unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const;
  LLVM_ABI unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const;

  /// \returns The bonus of inlining the last call to a static function.
  LLVM_ABI int getInliningLastCallToStaticBonus() const;

  /// \returns A value to be added to the inlining threshold.
  LLVM_ABI unsigned adjustInliningThreshold(const CallBase *CB) const;

  /// \returns The cost of having an Alloca in the caller if not inlined, to be
  /// added to the threshold.
  LLVM_ABI unsigned getCallerAllocaCost(const CallBase *CB,
                                        const AllocaInst *AI) const;

  /// \returns Vector bonus in percent.
  ///
  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
  /// and apply this bonus based on the percentage of vector instructions. A
  /// bonus is applied if the vector instructions exceed 50% and half that
  /// amount is applied if they exceed 10%. Note that these bonuses are
  /// somewhat arbitrary and evolved over time by accident as much as because
  /// they are principled bonuses.
  /// FIXME: It would be nice to base the bonus values on something more
  /// scientific. A target may have no bonus on vector instructions.
  LLVM_ABI int getInlinerVectorBonusPercent() const;
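
  // Illustrative sketch: costing the pointer chain of a group of consecutive
  // loads, SLP-vectorizer style. `Ptrs`, `Base`, and `ElemTy` are
  // hypothetical.
  //
  //   InstructionCost PtrCost = TTI.getPointersChainCost(
  //       Ptrs, Base, TTI::PointersChainInfo::getUnitStride(), ElemTy,
  //       TTI::TCK_RecipThroughput);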

  /// \return the expected cost of a memcpy, which could e.g. depend on the
  /// source/destination type and alignment and the number of bytes copied.
  LLVM_ABI InstructionCost getMemcpyCost(const Instruction *I) const;

  /// Returns the maximum memset / memcpy size in bytes that still makes it
  /// profitable to inline the call.
  LLVM_ABI uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;

  /// \return The estimated number of case clusters when lowering \p 'SI'.
  /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
  /// table.
  LLVM_ABI unsigned
  getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
                                   ProfileSummaryInfo *PSI,
                                   BlockFrequencyInfo *BFI) const;

  /// Estimate the cost of a given IR user when lowered.
  ///
  /// This can estimate the cost of either a ConstantExpr or Instruction when
  /// lowered.
  ///
  /// \p Operands is a list of operands which can be a result of
  /// transformations of the current operands. The number of operands on the
  /// list must equal the number of current operands the IR user has, and
  /// their order on the list must be the same as the order of the current
  /// operands the IR user has.
  ///
  /// The returned cost is defined in terms of \c TargetCostConstants, see its
  /// comments for a detailed explanation of the cost values.
  LLVM_ABI InstructionCost getInstructionCost(const User *U,
                                              ArrayRef<const Value *> Operands,
                                              TargetCostKind CostKind) const;

  /// This is a helper function which calls the three-argument
  /// getInstructionCost with \p Operands which are the current operands U has.
  InstructionCost getInstructionCost(const User *U,
                                     TargetCostKind CostKind) const {
    SmallVector<const Value *, 4> Operands(U->operand_values());
    return getInstructionCost(U, Operands, CostKind);
  }
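
  // Illustrative sketch: summing the size-and-latency cost of a basic block,
  // e.g. to compare against an unrolling or speculation budget. `BB` and
  // `Budget` are hypothetical.
  //
  //   InstructionCost Total = 0;
  //   for (const Instruction &I : *BB)
  //     Total += TTI.getInstructionCost(&I, TTI::TCK_SizeAndLatency);
  //   if (Total > Budget)
  //     ...; // too expensive to transform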

  /// If a branch or a select condition is skewed in one direction by more than
  /// this factor, it is very likely to be predicted correctly.
  LLVM_ABI BranchProbability getPredictableBranchThreshold() const;

  /// Returns the estimated penalty of a branch misprediction in latency,
  /// which indicates how aggressively the target wants unpredictable branches
  /// eliminated. A zero return value means extra optimization applied to them
  /// should be minimal.
  LLVM_ABI InstructionCost getBranchMispredictPenalty() const;

  /// Return true if branch divergence exists.
  ///
  /// Branch divergence has a significantly negative impact on GPU performance
  /// when threads in the same wavefront take different paths due to
  /// conditional branches.
  ///
  /// If \p F is passed, provides a context function. If \p F is known to only
  /// execute in a single threaded environment, the target may choose to skip
  /// uniformity analysis and assume all values are uniform.
  LLVM_ABI bool hasBranchDivergence(const Function *F = nullptr) const;

  /// Returns whether V is a source of divergence.
  ///
  /// This function provides the target-dependent information for
  /// the target-independent UniformityAnalysis.
  LLVM_ABI bool isSourceOfDivergence(const Value *V) const;

  // Returns true for the target-specific set of operations which produce a
  // uniform result even when taking non-uniform arguments.
  LLVM_ABI bool isAlwaysUniform(const Value *V) const;

  /// Query the target whether the specified address space cast from FromAS to
  /// ToAS is valid.
  LLVM_ABI bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;

  /// Return false if a \p AS0 address cannot possibly alias a \p AS1 address.
  LLVM_ABI bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const;

  /// Returns the address space ID for a target's 'flat' address space. Note
  /// this is not necessarily the same as addrspace(0), which LLVM sometimes
  /// refers to as the generic address space. The flat address space is a
  /// generic address space that can be used to access multiple segments of
  /// memory with different address spaces. Access of a memory location through
  /// a pointer with this address space is expected to be legal but slower
  /// compared to the same memory location accessed through a pointer with a
  /// different address space.
  //
  /// This is for targets with different pointer representations which can
  /// be converted with the addrspacecast instruction. If a pointer is
  /// converted to this address space, optimizations should attempt to replace
  /// the access with the source address space.
  ///
  /// \returns ~0u if the target does not have such a flat address space to
  /// optimize away.
  LLVM_ABI unsigned getFlatAddressSpace() const;

  /// Return any intrinsic address operand indexes which may be rewritten if
  /// they use a flat address space pointer.
  ///
  /// \returns true if the intrinsic was handled.
  LLVM_ABI bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                           Intrinsic::ID IID) const;

  LLVM_ABI bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;

  /// Return true if globals in this address space can have initializers other
  /// than `undef`.
  LLVM_ABI bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const;

  LLVM_ABI unsigned getAssumedAddrSpace(const Value *V) const;

  LLVM_ABI bool isSingleThreaded() const;

  LLVM_ABI std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const;

  /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
  /// NewV, which has a different address space. This should happen for every
  /// operand index that collectFlatAddressOperands returned for the intrinsic.
  /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
  /// new value (which may be the original \p II with modified operands).
  LLVM_ABI Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                   Value *OldV,
                                                   Value *NewV) const;

  /// Test whether calls to a function lower to actual program function
  /// calls.
  ///
  /// The idea is to test whether the program is likely to require a 'call'
  /// instruction or equivalent in order to call the given function.
  ///
  /// FIXME: It's not clear that this is a good or useful query API. Clients
  /// should probably move to simpler cost metrics using the above.
  /// Alternatively, we could split the cost interface into distinct code-size
  /// and execution-speed costs. This would allow modelling the core of this
  /// query more accurately as a call is a single small instruction, but
  /// incurs significant execution cost.
  LLVM_ABI bool isLoweredToCall(const Function *F) const;
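
  // Illustrative sketch: how an InferAddressSpaces-style pass might use the
  // two flat-address-space hooks above together. `II`, `FlatPtr`, and
  // `SpecificPtr` are hypothetical.
  //
  //   SmallVector<int, 4> OpIndexes;
  //   if (TTI.collectFlatAddressOperands(OpIndexes, II->getIntrinsicID())) {
  //     Value *NewV =
  //         TTI.rewriteIntrinsicWithAddressSpace(II, FlatPtr, SpecificPtr);
  //     if (NewV && NewV != II)
  //       II->replaceAllUsesWith(NewV);
  //   }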

  struct LSRCost {
    /// TODO: Some of these could be merged. Also, a lexical ordering
    /// isn't always optimal.
    unsigned Insns;
    unsigned NumRegs;
    unsigned AddRecCost;
    unsigned NumIVMuls;
    unsigned NumBaseAdds;
    unsigned ImmCost;
    unsigned SetupCost;
    unsigned ScaleCost;
  };

  /// Parameters that control the generic loop unrolling transformation.
  struct UnrollingPreferences {
    /// The cost threshold for the unrolled loop. Should be relative to the
    /// getInstructionCost values returned by this API, and the expectation is
    /// that the unrolled loop's instructions when run through that interface
    /// should not exceed this cost. However, this is only an estimate. Also,
    /// specific loops may be unrolled even with a cost above this threshold if
    /// deemed profitable. Set this to UINT_MAX to disable the loop body cost
    /// restriction.
    unsigned Threshold;
    /// If complete unrolling will reduce the cost of the loop, we will boost
    /// the Threshold by a certain percent to allow more aggressive complete
    /// unrolling. This value provides the maximum boost percentage that we
    /// can apply to Threshold (the value should be no less than 100).
    /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
    ///                                    MaxPercentThresholdBoost / 100)
    /// E.g. if complete unrolling reduces the loop execution time by 50%
    /// then we boost the threshold by the factor of 2x. If unrolling is not
    /// expected to reduce the running time, then we do not increase the
    /// threshold.
    unsigned MaxPercentThresholdBoost;
    /// The cost threshold for the unrolled loop when optimizing for size (set
    /// to UINT_MAX to disable).
    unsigned OptSizeThreshold;
    /// The cost threshold for the unrolled loop, like Threshold, but used
    /// for partial/runtime unrolling (set to UINT_MAX to disable).
    unsigned PartialThreshold;
    /// The cost threshold for the unrolled loop when optimizing for size, like
    /// OptSizeThreshold, but used for partial/runtime unrolling (set to
    /// UINT_MAX to disable).
    unsigned PartialOptSizeThreshold;
    /// A forced unrolling factor (the number of concatenated bodies of the
    /// original loop in the unrolled loop body). When set to 0, the unrolling
    /// transformation will select an unrolling factor based on the current
    /// cost threshold and other factors.
    unsigned Count;
    /// Default unroll count for loops with run-time trip count.
    unsigned DefaultUnrollRuntimeCount;
    // Set the maximum unrolling factor. The unrolling factor may be selected
    // using the appropriate cost threshold, but may not exceed this number
    // (set to UINT_MAX to disable). This does not apply in cases where the
    // loop is being fully unrolled.
    unsigned MaxCount;
    /// Set the maximum upper bound of the trip count. Allowing MaxUpperBound
    /// to be overridden by a target gives more flexibility in certain cases.
    /// By default, MaxUpperBound uses UnrollMaxUpperBound, whose value is 8.
    unsigned MaxUpperBound;
    /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
    /// applies even if full unrolling is selected. This allows a target to
    /// fall back to Partial unrolling if full unrolling is above
    /// FullUnrollMaxCount.
    unsigned FullUnrollMaxCount;
    // Represents the number of instructions optimized away when the "back
    // edge" becomes a "fall through" in the unrolled loop.
    // For now we count a conditional branch on a backedge and a comparison
    // feeding it.
    unsigned BEInsns;
    /// Allow partial unrolling (unrolling of loops to expand the size of the
    /// loop body, not only to eliminate small constant-trip-count loops).
    bool Partial;
    /// Allow runtime unrolling (unrolling of loops to expand the size of the
    /// loop body even when the number of loop iterations is not known at
    /// compile time).
    bool Runtime;
    /// Allow generation of a loop remainder (extra iterations after unroll).
    bool AllowRemainder;
    /// Allow emitting expensive instructions (such as divisions) when
    /// computing the trip count of a loop for runtime unrolling.
    bool AllowExpensiveTripCount;
    /// Apply loop unrolling to any kind of loop
    /// (mainly to loops that fail runtime unrolling).
    bool Force;
    /// Allow using the trip count upper bound to unroll loops.
    bool UpperBound;
    /// Allow unrolling of all the iterations of the runtime loop remainder.
    bool UnrollRemainder;
    /// Allow unroll and jam. Used to enable unroll and jam for the target.
    bool UnrollAndJam;
    /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
    /// value above is used during unroll and jam for the outer loop size.
    /// This value is used in the same manner to limit the size of the inner
    /// loop.
    unsigned UnrollAndJamInnerLoopThreshold;
    /// Don't allow loop unrolling to simulate more than this number of
    /// iterations when checking full unroll profitability.
    unsigned MaxIterationsCountToAnalyze;
    /// Don't disable runtime unrolling for the loops which were vectorized.
    bool UnrollVectorizedLoop = false;
    /// Don't allow runtime unrolling if expanding the trip count takes more
    /// than SCEVExpansionBudget.
    unsigned SCEVExpansionBudget;
    /// Allow runtime unrolling multi-exit loops. Should only be set if the
    /// target determined that multi-exit unrolling is profitable for the loop.
    /// Fall back to the generic logic to determine whether multi-exit
    /// unrolling is profitable if set to false.
    bool RuntimeUnrollMultiExit;
  };

  /// Get target-customized preferences for the generic loop unrolling
  /// transformation. The caller will initialize UP with the current
  /// target-independent defaults.
  LLVM_ABI void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                                        UnrollingPreferences &UP,
                                        OptimizationRemarkEmitter *ORE) const;
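
  // Illustrative sketch of the intended call pattern: the unroll pass seeds
  // UP with generic defaults, then lets the target adjust them. The default
  // values and `LoopCost` are hypothetical.
  //
  //   TTI::UnrollingPreferences UP;
  //   UP.Threshold = 300; // generic default, illustrative only
  //   UP.Partial = UP.Runtime = false;
  //   TTI.getUnrollingPreferences(L, SE, UP, &ORE);
  //   if (LoopCost <= UP.Threshold)
  //     ...; // profitable to fully unroll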

  /// Query the target whether it would be profitable to convert the given loop
  /// into a hardware loop.
  LLVM_ABI bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                         AssumptionCache &AC,
                                         TargetLibraryInfo *LibInfo,
                                         HardwareLoopInfo &HWLoopInfo) const;

  // Query the target for the minimum vectorization factor at which epilogue
  // vectorization should be considered.
  LLVM_ABI unsigned getEpilogueVectorizationMinVF() const;

  /// Query the target whether it would be preferred to create a predicated
  /// vector loop, which can avoid the need to emit a scalar epilogue loop.
  LLVM_ABI bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;

  /// Query the target for the preferred style of tail folding.
  /// \param IVUpdateMayOverflow Tells whether it is known if the IV update
  /// may (or will never) overflow for the suggested VF/UF in the given loop.
  /// Targets can use this information to select a more optimal tail folding
  /// style. The value conservatively defaults to true, such that no
  /// assumptions are made on overflow.
  LLVM_ABI TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const;

  // Parameters that control the loop peeling transformation.
  struct PeelingPreferences {
    /// A forced peeling factor (the number of bodies of the original loop
    /// that should be peeled off before the loop body). When set to 0, a
    /// peeling factor is chosen based on profile information and other
    /// factors.
    unsigned PeelCount;
    /// Allow peeling off loop iterations.
    bool AllowPeeling;
    /// Allow peeling off loop iterations for loop nests.
    bool AllowLoopNestsPeeling;
    /// Allow peeling based on profile. Used to enable peeling off all
    /// iterations based on the provided profile.
    /// If the value is true, the peeling cost model can decide to peel only
    /// some iterations, and in this case it will set this to false.
    bool PeelProfiledIterations;

    /// Peel off the last PeelCount loop iterations.
    bool PeelLast;
  };

  /// Get target-customized preferences for the generic loop peeling
  /// transformation. The caller will initialize \p PP with the current
  /// target-independent defaults with information from \p L and \p SE.
  LLVM_ABI void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                      PeelingPreferences &PP) const;

  /// Targets can implement their own combinations for target-specific
  /// intrinsics. This function will be called from the InstCombine pass every
  /// time a target-specific intrinsic is encountered.
  ///
  /// \returns std::nullopt to not do anything target specific, or a value that
  /// will be returned from the InstCombiner. It is also possible to stop
  /// further processing of the intrinsic by returning nullptr.
  LLVM_ABI std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const;
  /// Can be used to implement target-specific instruction combining.
  /// \see instCombineIntrinsic
  LLVM_ABI std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) const;
  /// Can be used to implement target-specific instruction combining.
  /// \see instCombineIntrinsic
  LLVM_ABI std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts,
      APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;
  /// @}
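
  // Illustrative sketch: the general shape of a target-side
  // instCombineIntrinsic override. The intrinsic and the simplification are
  // hypothetical.
  //
  //   std::optional<Instruction *>
  //   MyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
  //                                   IntrinsicInst &II) const {
  //     if (II.getIntrinsicID() == Intrinsic::my_target_intrinsic) // made up
  //       if (isa<Constant>(II.getArgOperand(0)))
  //         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  //     return std::nullopt; // nothing target-specific to do
  //   }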

  /// \name Scalar Target Information
  /// @{

  /// Flags indicating the kind of support for population count.
  ///
  /// Compared to the SW implementation, HW support is supposed to
  /// significantly boost the performance when the population is dense, and it
  /// may or may not degrade performance if the population is sparse. HW
  /// support is considered "Fast" if it can outperform, or is on a par with,
  /// the SW implementation when the population is sparse; otherwise, it is
  /// considered "Slow".
  enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };

  /// Return true if the specified immediate is a legal add immediate, that
  /// is, the target has add instructions which can add a register with the
  /// immediate without having to materialize the immediate into a register.
  LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const;

  /// Return true if adding the specified scalable immediate is legal, that is,
  /// the target has add instructions which can add a register with the
  /// immediate (multiplied by vscale) without having to materialize the
  /// immediate into a register.
  LLVM_ABI bool isLegalAddScalableImmediate(int64_t Imm) const;

  /// Return true if the specified immediate is a legal icmp immediate,
  /// that is, the target has icmp instructions which can compare a register
  /// against the immediate without having to materialize the immediate into a
  /// register.
  LLVM_ABI bool isLegalICmpImmediate(int64_t Imm) const;

  /// Return true if the addressing mode represented by AM is legal for
  /// this target, for a load/store of the specified type.
  /// The type may be VoidTy, in which case only return true if the addressing
  /// mode is legal for a load/store of any legal type.
  /// If the target returns true in LSRWithInstrQueries(), I may be valid.
  /// \param ScalableOffset represents a quantity of bytes multiplied by
  /// vscale, an invariant value known only at runtime. Most targets should not
  /// accept a scalable offset.
  ///
  /// TODO: Handle pre/postinc as well.
  LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale, unsigned AddrSpace = 0,
                                      Instruction *I = nullptr,
                                      int64_t ScalableOffset = 0) const;

  /// Return true if the LSR cost of C1 is lower than that of C2.
  LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                              const TargetTransformInfo::LSRCost &C2) const;

  /// Return true if LSR's major cost is the number of registers. Targets
  /// which implement their own isLSRCostLess and unset number of registers as
  /// major cost should return false, otherwise return true.
  LLVM_ABI bool isNumRegsMajorCostOfLSR() const;

  /// Return true if LSR should drop a found solution if it's calculated to be
  /// less profitable than the baseline.
  LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const;

  /// \returns true if LSR should not optimize a chain that includes \p I.
  LLVM_ABI bool isProfitableLSRChainElement(Instruction *I) const;

  /// Return true if the target can fuse a compare and branch.
  /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
  /// calculation for the instructions in a loop.
  LLVM_ABI bool canMacroFuseCmp() const;

  /// Return true if the target can save a compare for loop count, for example
  /// hardware loop saves a compare.
  LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                           LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
                           TargetLibraryInfo *LibInfo) const;

  enum AddressingModeKind {
    AMK_PreIndexed,
    AMK_PostIndexed,
    AMK_None
  };

  /// Return the preferred addressing mode LSR should make efforts to generate.
  LLVM_ABI AddressingModeKind
  getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const;

  /// Return true if the target supports masked store.
  LLVM_ABI bool isLegalMaskedStore(Type *DataType, Align Alignment,
                                   unsigned AddressSpace) const;
  /// Return true if the target supports masked load.
  LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment,
                                  unsigned AddressSpace) const;
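
  // Illustrative sketch: a vectorizer-style legality check before forming a
  // masked load. `VecTy`, `Alignment`, and `AS` are hypothetical.
  //
  //   if (TTI.isLegalMaskedLoad(VecTy, Alignment, AS))
  //     ...; // emit llvm.masked.load
  //   else
  //     ...; // fall back to a scalarized or if-converted form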

  /// Return true if the target supports nontemporal store.
  LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const;
  /// Return true if the target supports nontemporal load.
  LLVM_ABI bool isLegalNTLoad(Type *DataType, Align Alignment) const;

  /// \returns true if the target supports broadcasting a load to a vector of
  /// type <NumElements x ElementTy>.
  LLVM_ABI bool isLegalBroadcastLoad(Type *ElementTy,
                                     ElementCount NumElements) const;

  /// Return true if the target supports masked scatter.
  LLVM_ABI bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
  /// Return true if the target supports masked gather.
  LLVM_ABI bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
  /// Return true if the target forces scalarizing of llvm.masked.gather
  /// intrinsics.
  LLVM_ABI bool forceScalarizeMaskedGather(VectorType *Type,
                                           Align Alignment) const;
  /// Return true if the target forces scalarizing of llvm.masked.scatter
  /// intrinsics.
  LLVM_ABI bool forceScalarizeMaskedScatter(VectorType *Type,
                                            Align Alignment) const;

  /// Return true if the target supports masked compress store.
  LLVM_ABI bool isLegalMaskedCompressStore(Type *DataType,
                                           Align Alignment) const;
  /// Return true if the target supports masked expand load.
  LLVM_ABI bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const;

  /// Return true if the target supports strided load.
  LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;

  /// Return true if the target supports interleaved access for the given
  /// vector type \p VTy, interleave factor \p Factor, alignment \p Alignment
  /// and address space \p AddrSpace.
  LLVM_ABI bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
                                             Align Alignment,
                                             unsigned AddrSpace) const;

  // Return true if the target supports masked vector histograms.
  LLVM_ABI bool isLegalMaskedVectorHistogram(Type *AddrType,
                                             Type *DataType) const;

  /// Return true if this is an alternating opcode pattern that can be lowered
  /// to a single instruction on the target. In X86 this is for the addsub
  /// instruction which corresponds to a Shuffle + Fadd + FSub pattern in IR.
  /// This function expects two opcodes: \p Opcode0 and \p Opcode1 being
  /// selected by \p OpcodeMask. The mask contains one bit per lane and is a
  /// `0` when \p Opcode0 is selected and `1` when \p Opcode1 is selected.
  /// \p VecTy is the vector type of the instruction to be generated.
  LLVM_ABI bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
                                unsigned Opcode1,
                                const SmallBitVector &OpcodeMask) const;

  /// Return true if we should be enabling ordered reductions for the target.
  LLVM_ABI bool enableOrderedReductions() const;

  /// Return true if the target has a unified operation to calculate division
  /// and remainder. If so, the additional implicit multiplication and
  /// subtraction required to calculate a remainder from division are free.
  /// This can enable more aggressive transformations for division and
  /// remainder than would typically be allowed using throughput or size cost
  /// models.
  LLVM_ABI bool hasDivRemOp(Type *DataType, bool IsSigned) const;
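
  // Illustrative sketch: a DivRemPairs-style use of hasDivRemOp. If the
  // target has a combined div/rem operation, keep both instructions;
  // otherwise expand the remainder in terms of the division. `Rem` is a
  // hypothetical BinaryOperator *.
  //
  //   bool IsSigned = Rem->getOpcode() == Instruction::SRem;
  //   if (!TTI.hasDivRemOp(Rem->getType(), IsSigned)) {
  //     // Rewrite X % Y as X - (X / Y) * Y so only the divide remains.
  //     ...
  //   }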

  /// Return true if the given instruction (assumed to be a memory access
  /// instruction) has a volatile variant. If that's the case then we can avoid
  /// addrspacecast to generic AS for volatile loads/stores. The default
  /// implementation returns false, which prevents address space inference for
  /// volatile loads/stores.
  LLVM_ABI bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;

  /// Return true if the target doesn't mind addresses in vectors.
  LLVM_ABI bool prefersVectorizedAddressing() const;

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns a negative value.
  /// TODO: Handle pre/postinc as well.
  LLVM_ABI InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                StackOffset BaseOffset,
                                                bool HasBaseReg, int64_t Scale,
                                                unsigned AddrSpace = 0) const;

  /// Return true if the loop strength reduce pass should make
  /// Instruction* based TTI queries to isLegalAddressingMode(). This is
  /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
  /// immediate offset and no index register.
  LLVM_ABI bool LSRWithInstrQueries() const;

  /// Return true if it's free to truncate a value of type Ty1 to type
  /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to
  /// i16 by referencing its sub-register AX.
  LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const;

  /// Return true if it is profitable to hoist an instruction in the
  /// then/else blocks to before the if.
  LLVM_ABI bool isProfitableToHoist(Instruction *I) const;

  LLVM_ABI bool useAA() const;

  /// Return true if this type is legal.
  LLVM_ABI bool isTypeLegal(Type *Ty) const;

  /// Returns the estimated number of registers required to represent \p Ty.
  LLVM_ABI unsigned getRegUsageForType(Type *Ty) const;

  /// Return true if switches should be turned into lookup tables for the
  /// target.
  LLVM_ABI bool shouldBuildLookupTables() const;

  /// Return true if switches should be turned into lookup tables
  /// containing this constant value for the target.
  LLVM_ABI bool shouldBuildLookupTablesForConstant(Constant *C) const;

  /// Return true if lookup tables should be turned into relative lookup
  /// tables.
  LLVM_ABI bool shouldBuildRelLookupTables() const;

  /// Return true if the input function, which is cold at all call sites,
  /// should use the coldcc calling convention.
  LLVM_ABI bool useColdCCForColdCall(Function &F) const;

  LLVM_ABI bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const;

  /// Identifies if the vector form of the intrinsic has a scalar operand.
  LLVM_ABI bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                                   unsigned ScalarOpdIdx) const;

  /// Identifies if the vector form of the intrinsic is overloaded on the type
  /// of the operand at index \p OpdIdx, or on the return type if \p OpdIdx is
  /// -1.
  LLVM_ABI bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
                                                       int OpdIdx) const;

  /// Identifies if the vector form of the intrinsic that returns a struct is
  /// overloaded at the struct element index \p RetIdx.
  LLVM_ABI bool
  isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID,
                                                   int RetIdx) const;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors. The involved values may be passed in VL if
  /// Insert is true.
  LLVM_ABI InstructionCost getScalarizationOverhead(
      VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
      TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
      ArrayRef<Value *> VL = {}) const;

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
      ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
      TTI::TargetCostKind CostKind) const;

  /// If the target has efficient vector element load/store instructions, it
  /// can return true here so that insertion/extraction costs are not added to
  /// the scalarization cost of a load/store.
  LLVM_ABI bool supportsEfficientVectorElementLoadStore() const;

  /// If the target supports tail calls.
  LLVM_ABI bool supportsTailCalls() const;

  /// If the target supports tail call on \p CB.
  LLVM_ABI bool supportsTailCallFor(const CallBase *CB) const;

  /// Don't restrict interleaved unrolling to small loops.
  LLVM_ABI bool enableAggressiveInterleaving(bool LoopHasReductions) const;

  /// Returns options for expansion of memcmp. IsZeroCmp is
  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
  struct MemCmpExpansionOptions {
    // Return true if memcmp expansion is enabled.
    operator bool() const { return MaxNumLoads > 0; }

    // Maximum number of load operations.
    unsigned MaxNumLoads = 0;

    // The list of available load sizes (in bytes), sorted in decreasing order.
    SmallVector<unsigned, 8> LoadSizes;

    // For memcmp expansion when the memcmp result is only compared equal or
    // not-equal to 0, allow up to this number of load pairs per block. As an
    // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
    //   a0 = load2bytes &a[0]
    //   b0 = load2bytes &b[0]
    //   a2 = load1byte  &a[2]
    //   b2 = load1byte  &b[2]
    //   r  = cmp eq (a0 ^ b0 | a2 ^ b2), 0
    unsigned NumLoadsPerBlock = 1;

    // Set to true to allow overlapping loads. For example, 7-byte compares can
    // be done with two 4-byte compares instead of 4+2+1-byte compares. This
    // requires all loads in LoadSizes to be doable in an unaligned way.
    bool AllowOverlappingLoads = false;

    // Sometimes, the amount of data that needs to be compared is smaller than
    // the standard register size, but it cannot be loaded with just one load
    // instruction. For example, if the size of the memory comparison is 6
    // bytes, we can handle it more efficiently by loading all 6 bytes in a
    // single block and generating an 8-byte number, instead of generating two
    // separate blocks with conditional jumps for 4 and 2 byte loads. This
    // approach simplifies the process and produces the comparison result as
    // normal. This array lists the allowed sizes of memcmp tails that can be
    // merged into one block.
    SmallVector<unsigned, 4> AllowedTailExpansions;
  };
  LLVM_ABI MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                        bool IsZeroCmp) const;
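
  // Illustrative sketch: how a target override of enableMemCmpExpansion
  // might populate the options; the numbers are made up.
  //
  //   TTI::MemCmpExpansionOptions
  //   MyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  //     TTI::MemCmpExpansionOptions Options;
  //     Options.MaxNumLoads = OptSize ? 2 : 8;
  //     Options.LoadSizes = {8, 4, 2, 1}; // decreasing order
  //     if (IsZeroCmp)
  //       Options.NumLoadsPerBlock = 2;   // allow xor/or trees per block
  //     Options.AllowOverlappingLoads = true;
  //     return Options;
  //   }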

  /// Should the Select Optimization pass be enabled and run.
  LLVM_ABI bool enableSelectOptimize() const;

  /// Should the Select Optimization pass treat the given instruction like a
  /// select, potentially converting it to a conditional branch. This can
  /// include select-like instructions like or(zext(c), x) that can be
  /// converted to selects.
  LLVM_ABI bool shouldTreatInstructionLikeSelect(const Instruction *I) const;

  /// Enable matching of interleaved access groups.
  LLVM_ABI bool enableInterleavedAccessVectorization() const;

  /// Enable matching of interleaved access groups that contain predicated
  /// accesses or gaps and are therefore vectorized using masked
  /// vector loads/stores.
  LLVM_ABI bool enableMaskedInterleavedAccessVectorization() const;

  /// Indicate that it is potentially unsafe to automatically vectorize
  /// floating-point operations because vector and scalar floating-point
  /// semantics may differ. For example, ARM NEON v7 SIMD math does not
  /// support IEEE-754 denormal numbers, while depending on the platform,
  /// scalar floating-point math does.
  /// This applies to floating-point math operations and calls, not memory
  /// operations, shuffles, or casts.
  LLVM_ABI bool isFPVectorizationPotentiallyUnsafe() const;

  /// Determine if the target supports unaligned memory accesses.
  LLVM_ABI bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
                                               unsigned AddressSpace = 0,
                                               Align Alignment = Align(1),
                                               unsigned *Fast = nullptr) const;

  /// Return hardware support for population count.
  LLVM_ABI PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;

  /// Return true if the hardware has a fast square-root instruction.
  LLVM_ABI bool haveFastSqrt(Type *Ty) const;

  /// Return true if the cost of the instruction is too high to speculatively
  /// execute and should be kept behind a branch.
  /// This normally just wraps around a getInstructionCost() call, but some
  /// targets might report a low TCK_SizeAndLatency value that is incompatible
  /// with the fixed TCC_Expensive value.
  /// NOTE: This assumes the instruction passes isSafeToSpeculativelyExecute().
  LLVM_ABI bool isExpensiveToSpeculativelyExecute(const Instruction *I) const;

  /// Return true if it is faster to check if a floating-point value is NaN
  /// (or not-NaN) versus a comparison against a constant FP zero value.
  /// Targets should override this if materializing a 0.0 for comparison is
  /// generally as cheap as checking for ordered/unordered.
  LLVM_ABI bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;

  /// Return the expected cost of supporting the floating point operation
  /// of the specified type.
  LLVM_ABI InstructionCost getFPOpCost(Type *Ty) const;

  /// Return the expected cost of materializing the given integer
  /// immediate of the specified type.
  LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                         TargetCostKind CostKind) const;
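
  // Illustrative sketch: a constant-hoisting style check on whether an
  // immediate is cheap enough to rematerialize at each use. `Imm` and `Ty`
  // are hypothetical.
  //
  //   InstructionCost Cost =
  //       TTI.getIntImmCost(Imm, Ty, TTI::TCK_SizeAndLatency);
  //   if (Cost <= TTI::TCC_Basic)
  //     ...; // keep the immediate at each use rather than hoisting it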

  /// Return the expected cost of materialization for the given integer
  /// immediate of the specified type for a given instruction. The cost can be
  /// zero if the immediate can be folded into the specified instruction.
  LLVM_ABI InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
                                             const APInt &Imm, Type *Ty,
                                             TargetCostKind CostKind,
                                             Instruction *Inst = nullptr) const;
  LLVM_ABI InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                               const APInt &Imm, Type *Ty,
                                               TargetCostKind CostKind) const;

  /// Return the expected cost for the given integer immediate when optimising
  /// for size. This is different from the other integer immediate cost
  /// functions in that it is subtarget agnostic. This is useful when you e.g.
  /// target one ISA such as AArch32 but smaller encodings could be possible
  /// with another such as Thumb. This return value is used as a penalty when
  /// the total costs for a constant are calculated (the bigger the cost, the
  /// more beneficial constant hoisting is).
  LLVM_ABI InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
                                                 const APInt &Imm,
                                                 Type *Ty) const;

  /// It can be advantageous to detach complex constants from their uses to
  /// make their generation cheaper. This hook allows targets to report when
  /// such transformations might negatively affect the code generation of the
  /// underlying operation. The motivating example is divides, where hoisting
  /// constants impedes the code generator's ability to transform them into
  /// combinations of simpler operations.
  LLVM_ABI bool preferToKeepConstantsAttached(const Instruction &Inst,
                                              const Function &Fn) const;

  /// @}

  /// \name Vector Target Information
  /// @{

  /// The various kinds of shuffle patterns for vector queries.
  enum ShuffleKind {
    SK_Broadcast,        ///< Broadcast element 0 to all other elements.
    SK_Reverse,          ///< Reverse the order of the vector.
    SK_Select,           ///< Selects elements from the corresponding lane of
                         ///< either source operand. This is equivalent to a
                         ///< vector select with a constant condition operand.
    SK_Transpose,        ///< Transpose two vectors.
    SK_InsertSubvector,  ///< InsertSubvector. Index indicates start offset.
    SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
    SK_PermuteTwoSrc,    ///< Merge elements from two source vectors into one
                         ///< with any shuffle mask.
    SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any
                         ///< shuffle mask.
    SK_Splice            ///< Concatenates elements from the first input vector
                         ///< with elements of the second input vector,
                         ///< returning a vector of the same type as the input
                         ///< vectors. Index indicates start offset in first
                         ///< input vector.
  };
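
  // Illustrative sketch: costing a broadcast shuffle. A getShuffleCost query
  // is assumed to be declared further down in this interface; its exact
  // parameter list varies, so the trailing arguments are elided. `VecTy` is
  // a hypothetical vector type.
  //
  //   InstructionCost SplatCost =
  //       TTI.getShuffleCost(TTI::SK_Broadcast, VecTy, ...);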

  /// Additional information about an operand's possible values.
  enum OperandValueKind {
    OK_AnyValue,               // Operand can have any value.
    OK_UniformValue,           // Operand is uniform (splat of a value).
    OK_UniformConstantValue,   // Operand is a uniform constant.
    OK_NonUniformConstantValue // Operand is a non-uniform constant value.
  };

  /// Additional properties of an operand's values.
  enum OperandValueProperties {
    OP_None = 0,
    OP_PowerOf2 = 1,
    OP_NegatedPowerOf2 = 2,
  };

  // Describe the values an operand can take. We're in the process
  // of migrating uses of OperandValueKind and OperandValueProperties
  // to use this class, and then will change the internal representation.
  struct OperandValueInfo {
    OperandValueKind Kind = OK_AnyValue;
    OperandValueProperties Properties = OP_None;

    bool isConstant() const {
      return Kind == OK_UniformConstantValue ||
             Kind == OK_NonUniformConstantValue;
    }
    bool isUniform() const {
      return Kind == OK_UniformConstantValue || Kind == OK_UniformValue;
    }
    bool isPowerOf2() const { return Properties == OP_PowerOf2; }
    bool isNegatedPowerOf2() const { return Properties == OP_NegatedPowerOf2; }

    OperandValueInfo getNoProps() const { return {Kind, OP_None}; }
  };

  /// \return the number of registers in the target-provided register class.
  LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const;

  /// \return true if the target supports load/store that enables fault
  /// suppression of memory operands when the source condition is false.
  LLVM_ABI bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const;

  /// \return the target-provided register class ID for the provided type,
  /// accounting for type promotion and other type-legalization techniques
  /// that the target might apply. However, it specifically does not account
  /// for the scalarization or splitting of vector types. Should a vector type
  /// require scalarization or splitting into multiple underlying vector
  /// registers, that type should be mapped to a register class containing no
  /// registers. Specifically, this is designed to provide a simple,
  /// high-level view of the register allocation later performed by the
  /// backend. These register classes don't necessarily map onto the register
  /// classes used by the backend.
  /// FIXME: It's not currently possible to determine how many registers
  /// are used by the provided type.
  LLVM_ABI unsigned getRegisterClassForType(bool Vector,
                                            Type *Ty = nullptr) const;

  /// \return the target-provided register class name.
  LLVM_ABI const char *getRegisterClassName(unsigned ClassID) const;

  enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector };

  /// \return The width of the largest scalar or vector register type.
  LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const;

  /// \return The width of the smallest vector register type.
  LLVM_ABI unsigned getMinVectorRegisterBitWidth() const;

  /// \return The maximum value of vscale if the target specifies an
  /// architectural maximum vector length, and std::nullopt otherwise.
  LLVM_ABI std::optional<unsigned> getMaxVScale() const;

  /// \return the value of vscale to tune the cost model for.
  LLVM_ABI std::optional<unsigned> getVScaleForTuning() const;

  /// \return true if vscale is known to be a power of 2.
  LLVM_ABI bool isVScaleKnownToBeAPowerOfTwo() const;
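
  // Illustrative sketch: deriving a starting vectorization factor from the
  // register width, loop-vectorizer style. `WidestElementBits` is
  // hypothetical.
  //
  //   TypeSize RegWidth = TTI.getRegisterBitWidth(TTI::RGK_FixedWidthVector);
  //   unsigned VF = std::max<unsigned>(
  //       1, RegWidth.getFixedValue() / WidestElementBits);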
  /// \return True if the vectorization factor should be chosen to
  /// make the vector of the smallest element type match the size of a
  /// vector register. For wider element types, this could result in
  /// creating vectors that span multiple vector registers.
  /// If false, the vectorization factor will be chosen based on the
  /// size of the widest element type.
  /// \p K is the register kind to use for vectorization.
  LLVM_ABI bool
  shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;

  /// \return The minimum vectorization factor for types of the given element
  /// bit width, or 0 if there is no minimum VF. The returned value only
  /// applies when shouldMaximizeVectorBandwidth returns true.
  /// If \p IsScalable is true, the returned ElementCount must be a scalable
  /// VF.
  LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const;

  /// \return The maximum vectorization factor for types of the given element
  /// bit width and opcode, or 0 if there is no maximum VF.
  /// Currently only used by the SLP vectorizer.
  LLVM_ABI unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;

  /// \return The minimum vectorization factor for the store instruction.
  /// Given an initial estimate of the minimum vector factor and the store
  /// value type, it tries to find the lowest VF that might still be
  /// profitable for vectorization.
  /// \param VF Initial estimate of the minimum vector factor.
  /// \param ScalarMemTy Scalar memory type of the store operation.
  /// \param ScalarValTy Scalar type of the stored value.
  /// Currently only used by the SLP vectorizer.
  LLVM_ABI unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                                      Type *ScalarValTy) const;

  /// \return True if \p I should be considered for address type promotion.
  /// \p AllowPromotionWithoutCommonHeader is set to true if promoting \p I is
  /// profitable without finding other extensions fed by the same input.
  LLVM_ABI bool shouldConsiderAddressTypePromotion(
      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;

  /// \return The size of a cache line in bytes.
  LLVM_ABI unsigned getCacheLineSize() const;

  /// The possible cache levels.
  enum class CacheLevel {
    L1D, // The L1 data cache
    L2D, // The L2 data cache

    // We currently do not model L3 caches, as their sizes differ widely
    // between microarchitectures. Also, we currently do not have a use for
    // L3 cache size modeling yet.
  };

  /// \return The size of the cache level in bytes, if available.
  LLVM_ABI std::optional<unsigned> getCacheSize(CacheLevel Level) const;

  /// \return The associativity of the cache level, if available.
  LLVM_ABI std::optional<unsigned>
  getCacheAssociativity(CacheLevel Level) const;

  /// \return The minimum architectural page size for the target.
  LLVM_ABI std::optional<unsigned> getMinPageSize() const;

  /// \return How far ahead of a load we should place the prefetch
  /// instruction. This is currently measured in number of instructions.
  LLVM_ABI unsigned getPrefetchDistance() const;
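
  // Illustrative sketch only: a prefetch-insertion client might combine the
  // cache and prefetch queries. Assumes a `TargetTransformInfo &TTI` in
  // scope.
  //
  //   unsigned LineSize = TTI.getCacheLineSize();
  //   std::optional<unsigned> L1Size =
  //       TTI.getCacheSize(TargetTransformInfo::CacheLevel::L1D);
  //   unsigned Dist = TTI.getPrefetchDistance();
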
  /// Some HW prefetchers can handle accesses up to a certain constant stride.
  /// Sometimes prefetching is beneficial even below the HW prefetcher limit,
  /// and the arguments provided are meant to serve as a basis for deciding
  /// this for a particular loop.
  ///
  /// \param NumMemAccesses Number of memory accesses in the loop.
  /// \param NumStridedMemAccesses Number of the memory accesses that
  ///                              ScalarEvolution could find a known stride
  ///                              for.
  /// \param NumPrefetches Number of software prefetches that will be
  ///                      emitted as determined by the addresses
  ///                      involved and the cache line size.
  /// \param HasCall True if the loop contains a call.
  ///
  /// \return This is the minimum stride in bytes where it makes sense to
  ///         start adding SW prefetches. The default is 1, i.e. prefetch
  ///         with any stride.
  LLVM_ABI unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                         unsigned NumStridedMemAccesses,
                                         unsigned NumPrefetches,
                                         bool HasCall) const;

  /// \return The maximum number of iterations to prefetch ahead. If
  /// the required number of iterations is more than this number, no
  /// prefetching is performed.
  LLVM_ABI unsigned getMaxPrefetchIterationsAhead() const;

  /// \return True if prefetching should also be done for writes.
  LLVM_ABI bool enableWritePrefetching() const;

  /// \return True if the target wants to issue a prefetch in address space
  /// \p AS.
  LLVM_ABI bool shouldPrefetchAddressSpace(unsigned AS) const;

  /// \return The cost of a partial reduction, which is a reduction from a
  /// vector to another vector with fewer elements of larger size. They are
  /// represented by the llvm.experimental.partial.reduce.add intrinsic, which
  /// takes an accumulator of type \p AccumType and a second vector operand to
  /// be accumulated, whose element count is specified by \p VF. The type of
  /// reduction is specified by \p Opcode. The second operand passed to the
  /// intrinsic could be the result of an extend, such as sext or zext. In
  /// this case \p BinOp is std::nullopt, \p InputTypeA represents the type
  /// being extended and \p OpAExtend the operation, i.e. sign- or
  /// zero-extend. Also, \p InputTypeB should be nullptr and \p OpBExtend
  /// should be None.
  /// Alternatively, the second operand could be the result of a binary
  /// operation performed on two extends, i.e.
  ///   mul(zext i8 %a -> i32, zext i8 %b -> i32).
  /// In this case \p BinOp may specify the opcode of the binary operation,
  /// \p InputTypeA and \p InputTypeB the types being extended, and
  /// \p OpAExtend, \p OpBExtend the form of extensions. An example of an
  /// operation that uses a partial reduction is a dot product, which reduces
  /// two vectors in a binary mul operation to another vector with 4 times
  /// fewer elements that are 4 times larger.
  LLVM_ABI InstructionCost getPartialReductionCost(
      unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
      ElementCount VF, PartialReductionExtendKind OpAExtend,
      PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
      TTI::TargetCostKind CostKind) const;

  /// \return The maximum interleave factor that any transform should try to
  /// perform on this target. This number depends on the level of parallelism
  /// and the number of execution units in the CPU.
  LLVM_ABI unsigned getMaxInterleaveFactor(ElementCount VF) const;

  /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
  LLVM_ABI static OperandValueInfo getOperandInfo(const Value *V);
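
  // Illustrative sketch only: clients typically classify operands before
  // asking for an arithmetic cost. Assumes a `BinaryOperator *BO` in scope.
  //
  //   TargetTransformInfo::OperandValueInfo Op2Info =
  //       TargetTransformInfo::getOperandInfo(BO->getOperand(1));
  //   bool DivByPow2Const = Op2Info.isConstant() && Op2Info.isPowerOf2();
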
  /// This is an approximation of reciprocal throughput of a math/logic op.
  /// A higher cost indicates less expected throughput.
  /// From Agner Fog's guides, reciprocal throughput is "the average number of
  /// clock cycles per instruction when the instructions are not part of a
  /// limiting dependency chain."
  /// Therefore, costs should be scaled to account for multiple execution
  /// units on the target that can process this type of instruction. For
  /// example, if there are 5 scalar integer units and 2 vector integer units
  /// that can calculate an 'add' in a single cycle, this model should
  /// indicate that the cost of the vector add instruction is 2.5 times the
  /// cost of the scalar add instruction.
  /// \p Args is an optional argument which holds the instruction operand
  /// values so the TTI can analyze those values, searching for special
  /// cases or optimizations based on those values.
  /// \p CxtI is the optional original context instruction, if one exists, to
  /// provide even more information.
  /// \p TLibInfo is used to search for platform specific vector library
  /// functions for instructions that might be converted to calls (e.g. frem).
  LLVM_ABI InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr,
      const TargetLibraryInfo *TLibInfo = nullptr) const;

  /// Returns the cost estimate for an alternating opcode pattern that can be
  /// lowered to a single instruction on the target. On X86 this is the case
  /// for the addsub instruction, which corresponds to a Shuffle + FAdd + FSub
  /// pattern in IR. This function expects two opcodes, \p Opcode0 and
  /// \p Opcode1, selected by \p OpcodeMask. The mask contains one bit per
  /// lane and is a `0` when \p Opcode0 is selected and a `1` when \p Opcode1
  /// is selected.
  /// \p VecTy is the vector type of the instruction to be generated.
  LLVM_ABI InstructionCost getAltInstrCost(
      VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
      const SmallBitVector &OpcodeMask,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

  /// \return The cost of a shuffle instruction of kind \p Kind with inputs of
  /// type \p SrcTy, producing a vector of type \p DstTy. The exact mask may
  /// be passed as \p Mask, or else the array will be empty. The \p Index and
  /// \p SubTp parameters are used by the subvector insertion shuffle kinds to
  /// show the insert point and the type of the subvector being inserted. The
  /// operands of the shuffle can be passed through \p Args, which helps
  /// improve the cost estimation in some cases, like in broadcast loads.
  LLVM_ABI InstructionCost getShuffleCost(
      ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
      ArrayRef<int> Mask = {},
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0,
      VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const;
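
  // Illustrative sketch only: the cost of splatting element 0 of a vector
  // can be queried as a broadcast shuffle, relying on the default arguments.
  // Assumes a `TargetTransformInfo &TTI` and a `VectorType *VecTy` in scope.
  //
  //   InstructionCost SplatCost = TTI.getShuffleCost(
  //       TargetTransformInfo::SK_Broadcast, VecTy, VecTy);
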
  /// Represents a hint about the context in which a cast is used.
  ///
  /// For zext/sext, the context of the cast is the operand, which must be a
  /// load of some kind. For trunc, the context is the single user of the
  /// instruction, which must be a store of some kind.
  ///
  /// This enum allows the vectorizer to give getCastInstrCost an idea of the
  /// type of cast it's dealing with, as not every cast is equal. For
  /// instance, the zext of a load may be free, but the zext of an
  /// interleaving load can be (very) expensive!
  ///
  /// See \c getCastContextHint to compute a CastContextHint from a cast
  /// Instruction*. Callers can use it if they don't need to override the
  /// context and just want it to be calculated from the instruction.
  ///
  /// FIXME: This handles the types of load/store that the vectorizer can
  /// produce, which are the cases where the context instruction is most
  /// likely to be incorrect. There are other situations where that can
  /// happen too, which might be handled here but in the long run a more
  /// general solution of costing multiple instructions at the same time may
  /// be better.
  enum class CastContextHint : uint8_t {
    None,          ///< The cast is not used with a load/store of any kind.
    Normal,        ///< The cast is used with a normal load/store.
    Masked,        ///< The cast is used with a masked load/store.
    GatherScatter, ///< The cast is used with a gather/scatter.
    Interleave,    ///< The cast is used with an interleaved load/store.
    Reversed,      ///< The cast is used with a reversed load/store.
  };

  /// Calculates a CastContextHint from \p I.
  /// This should be used by callers of getCastInstrCost if they wish to
  /// determine the context from some instruction.
  /// \returns the CastContextHint for ZExt/SExt/Trunc; returns None if \p I
  /// is nullptr or is another type of cast.
  LLVM_ABI static CastContextHint getCastContextHint(const Instruction *I);

  /// \return The expected cost of cast instructions, such as bitcast, trunc,
  /// zext, etc. If there is an existing instruction that holds Opcode, it
  /// may be passed in the 'I' parameter.
  LLVM_ABI InstructionCost getCastInstrCost(
      unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
      TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
      const Instruction *I = nullptr) const;

  /// \return The expected cost of a sign- or zero-extended vector extract.
  /// Use Index = -1 to indicate that there is no information about the index
  /// value.
  LLVM_ABI InstructionCost
  getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                           unsigned Index, TTI::TargetCostKind CostKind) const;

  /// \return The expected cost of control-flow related instructions such as
  /// Phi, Ret, Br, Switch.
  LLVM_ABI InstructionCost getCFInstrCost(
      unsigned Opcode, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
      const Instruction *I = nullptr) const;

  /// \returns The expected cost of compare and select instructions. If there
  /// is an existing instruction that holds Opcode, it may be passed in the
  /// 'I' parameter. The \p VecPred parameter can be used to indicate that the
  /// select is using a compare with the specified predicate as condition.
  /// When vector types are passed, \p VecPred must be used for all lanes.
  /// For a comparison, the two operands are the natural values. For a select,
  /// the two operands are the *value* operands, not the condition operand.
  LLVM_ABI InstructionCost getCmpSelInstrCost(
      unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      OperandValueInfo Op1Info = {OK_AnyValue, OP_None},
      OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
      const Instruction *I = nullptr) const;
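
  // Illustrative sketch only: when costing an existing cast, the context
  // hint can be derived from the instruction itself. Assumes a
  // `TargetTransformInfo &TTI` and a `CastInst *CI` in scope.
  //
  //   TargetTransformInfo::CastContextHint CCH =
  //       TargetTransformInfo::getCastContextHint(CI);
  //   InstructionCost Cost = TTI.getCastInstrCost(
  //       CI->getOpcode(), CI->getType(), CI->getSrcTy(), CCH,
  //       TargetTransformInfo::TCK_RecipThroughput, CI);
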
  /// \return The expected cost of vector Insert and Extract.
  /// Use -1 to indicate that there is no information on the index value.
  /// This is used when the instruction is not available; a typical use
  /// case is to provision the cost of vectorization/scalarization in
  /// vectorizer passes.
  LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                              TTI::TargetCostKind CostKind,
                                              unsigned Index = -1,
                                              const Value *Op0 = nullptr,
                                              const Value *Op1 = nullptr) const;

  /// \return The expected cost of vector Insert and Extract.
  /// Use -1 to indicate that there is no information on the index value.
  /// This is used when the instruction is not available; a typical use
  /// case is to provision the cost of vectorization/scalarization in
  /// vectorizer passes.
  /// \param ScalarUserAndIdx encodes the information about extracts from a
  /// vector with 'Scalar' being the value being extracted, 'User' being the
  /// user of the extract (nullptr if the user is not known before
  /// vectorization) and 'Idx' being the extract lane.
  LLVM_ABI InstructionCost getVectorInstrCost(
      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
      Value *Scalar,
      ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;

  /// \return The expected cost of vector Insert and Extract.
  /// This is used when the instruction is available, and the implementation
  /// asserts 'I' is not nullptr.
  ///
  /// A typical suitable use case is cost estimation when the vector
  /// instruction exists (e.g., from basic blocks during transformation).
  LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                              TTI::TargetCostKind CostKind,
                                              unsigned Index = -1) const;

  /// \return The expected cost of aggregate inserts and extracts. This is
  /// used when the instruction is not available; a typical use case is to
  /// provision the cost of vectorization/scalarization in vectorizer passes.
  LLVM_ABI InstructionCost getInsertExtractValueCost(
      unsigned Opcode, TTI::TargetCostKind CostKind) const;

  /// \return The cost of a replication shuffle of \p VF elements typed
  /// \p EltTy, replicated \p ReplicationFactor times.
  ///
  /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
  ///   <0,0,0,1,1,1,2,2,2,3,3,3>
  LLVM_ABI InstructionCost getReplicationShuffleCost(
      Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
      TTI::TargetCostKind CostKind) const;

  /// \return The cost of Load and Store instructions.
  LLVM_ABI InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      OperandValueInfo OpdInfo = {OK_AnyValue, OP_None},
      const Instruction *I = nullptr) const;

  /// \return The cost of VP Load and Store instructions.
  LLVM_ABI InstructionCost getVPMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      const Instruction *I = nullptr) const;
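
  // Illustrative sketch only: the cost of a plain vector load. Assumes a
  // `TargetTransformInfo &TTI`, a `VectorType *VecTy`, and an address-space
  // number `AS` in scope.
  //
  //   InstructionCost LoadCost = TTI.getMemoryOpCost(
  //       Instruction::Load, VecTy, Align(16), AS);
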
  /// \return The cost of masked Load and Store instructions.
  LLVM_ABI InstructionCost getMaskedMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

  /// \return The cost of a Gather or Scatter operation.
  /// \p Opcode - the kind of memory access, Load or Store
  /// \p DataTy - a vector type of the data to be loaded or stored
  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
  /// \p VariableMask - true when the memory access is predicated with a mask
  ///                   that is not a compile-time constant
  /// \p Alignment - alignment of a single element
  /// \p I - the optional original context instruction, if one exists, e.g.
  ///        the load/store to transform or the call to the gather/scatter
  ///        intrinsic
  LLVM_ABI InstructionCost getGatherScatterOpCost(
      unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
      Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      const Instruction *I = nullptr) const;

  /// \return The cost of an Expand Load or Compress Store operation.
  /// \p Opcode - the kind of memory access, Load or Store
  /// \p DataTy - a vector type of the data to be loaded or stored
  /// \p VariableMask - true when the memory access is predicated with a mask
  ///                   that is not a compile-time constant
  /// \p Alignment - alignment of a single element
  /// \p I - the optional original context instruction, if one exists, e.g.
  ///        the load/store to transform or the call to the corresponding
  ///        intrinsic
  LLVM_ABI InstructionCost getExpandCompressMemoryOpCost(
      unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      const Instruction *I = nullptr) const;

  /// \return The cost of strided memory operations.
  /// \p Opcode - the kind of memory access, Load or Store
  /// \p DataTy - a vector type of the data to be loaded or stored
  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
  /// \p VariableMask - true when the memory access is predicated with a mask
  ///                   that is not a compile-time constant
  /// \p Alignment - alignment of a single element
  /// \p I - the optional original context instruction, if one exists, e.g.
  ///        the load/store to transform or the call to the corresponding
  ///        intrinsic
  LLVM_ABI InstructionCost getStridedMemoryOpCost(
      unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
      Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      const Instruction *I = nullptr) const;
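
  // Illustrative sketch only: costing a gather whose mask is not a
  // compile-time constant. Assumes a `TargetTransformInfo &TTI`, a
  // `VectorType *VecTy`, and a pointer value `Ptr` in scope.
  //
  //   InstructionCost GatherCost = TTI.getGatherScatterOpCost(
  //       Instruction::Load, VecTy, Ptr, /*VariableMask=*/true, Align(4));
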
  /// \return The cost of the interleaved memory operation.
  /// \p Opcode is the memory operation code
  /// \p VecTy is the vector type of the interleaved access
  /// \p Factor is the interleave factor
  /// \p Indices is the indices for interleaved load members (as an
  ///    interleaved load allows gaps)
  /// \p Alignment is the alignment of the memory operation
  /// \p AddressSpace is the address space of the pointer
  /// \p UseMaskForCond indicates if the memory access is predicated
  /// \p UseMaskForGaps indicates if gaps should be masked
  LLVM_ABI InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) const;

  /// A helper function to determine the type of reduction algorithm used
  /// for a given \p Opcode and set of FastMathFlags \p FMF.
  static bool requiresOrderedReduction(std::optional<FastMathFlags> FMF) {
    return FMF && !(*FMF).allowReassoc();
  }

  /// Calculate the cost of vector reduction intrinsics.
  ///
  /// This is the cost of reducing the vector value of type \p Ty to a scalar
  /// value using the operation denoted by \p Opcode. The FastMathFlags
  /// parameter \p FMF indicates what type of reduction we are performing:
  ///   1. Tree-wise. This is the typical 'fast' reduction performed that
  ///      involves successively splitting a vector into half and doing the
  ///      operation on the pair of halves until you have a scalar value. For
  ///      example:
  ///        (v0, v1, v2, v3)
  ///        ((v0+v2), (v1+v3), undef, undef)
  ///        ((v0+v2+v1+v3), undef, undef, undef)
  ///      This is the default behaviour for integer operations, whereas for
  ///      floating point we only do this if \p FMF indicates that
  ///      reassociation is allowed.
  ///   2. Ordered. For a vector with N elements this involves performing N
  ///      operations in lane order, starting with an initial scalar value,
  ///      i.e.
  ///        result = InitVal + v0
  ///        result = result + v1
  ///        result = result + v2
  ///        result = result + v3
  ///      This is only the case for FP operations and when reassociation is
  ///      not allowed.
  LLVM_ABI InstructionCost getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

  LLVM_ABI InstructionCost getMinMaxReductionCost(
      Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF = FastMathFlags(),
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

  /// Calculate the cost of an extended reduction pattern, similar to
  /// getArithmeticReductionCost of an Add reduction with multiply and
  /// optional extensions. This is the cost of:
  ///   ResTy vecreduce.add(mul(A, B)), or
  ///   ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B))).
  LLVM_ABI InstructionCost getMulAccReductionCost(
      bool IsUnsigned, Type *ResTy, VectorType *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

  /// Calculate the cost of an extended reduction pattern, similar to
  /// getArithmeticReductionCost of a reduction with an extension.
  /// This is the cost of:
  ///   ResTy vecreduce.opcode(ext(Ty A)).
  LLVM_ABI InstructionCost getExtendedReductionCost(
      unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty,
      std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
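
  // Illustrative sketch only: an ordered (strict) FP add reduction can be
  // costed by passing FastMathFlags without reassociation set. Assumes a
  // `TargetTransformInfo &TTI` and a floating-point `VectorType *FVecTy` in
  // scope.
  //
  //   FastMathFlags FMF; // reassoc not set, so this is an ordered reduction
  //   InstructionCost RedCost = TTI.getArithmeticReductionCost(
  //       Instruction::FAdd, FVecTy, FMF);
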
  /// \returns The cost of Intrinsic instructions. Analyses the real
  /// arguments. Three cases are handled: 1. scalar instruction 2. vector
  /// instruction 3. scalar instruction which is to be vectorized.
  LLVM_ABI InstructionCost getIntrinsicInstrCost(
      const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const;

  /// \returns The cost of Call instructions.
  LLVM_ABI InstructionCost getCallInstrCost(
      Function *F, Type *RetTy, ArrayRef<Type *> Tys,
      TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;

  /// \returns The number of pieces into which the provided type must be
  /// split during legalization. Zero is returned when the answer is unknown.
  LLVM_ABI unsigned getNumberOfParts(Type *Tp) const;

  /// \returns The cost of the address computation. For most targets this can
  /// be merged into the instruction indexing mode. Some targets might want
  /// to distinguish between address computation for memory operations on
  /// vector types and scalar types. Such targets should override this
  /// function.
  /// The 'SE' parameter holds a pointer to the scalar evolution object which
  /// is used in order to get the Ptr step value in case of constant stride.
  /// The 'Ptr' parameter holds the SCEV of the access pointer.
  LLVM_ABI InstructionCost getAddressComputationCost(
      Type *Ty, ScalarEvolution *SE = nullptr, const SCEV *Ptr = nullptr) const;

  /// \returns The cost, if any, of keeping values of the given types alive
  /// over a callsite.
  ///
  /// Some types may require the use of register classes that do not have
  /// any callee-saved registers, and so would require a spill and fill.
  LLVM_ABI InstructionCost
  getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;

  /// \returns True if the intrinsic is a supported memory intrinsic. Info
  /// will contain additional information - whether the intrinsic may write
  /// or read to memory, volatility and the pointer. Info is undefined
  /// if false is returned.
  LLVM_ABI bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                   MemIntrinsicInfo &Info) const;

  /// \returns The maximum element size, in bytes, for an element
  /// unordered-atomic memory intrinsic.
  LLVM_ABI unsigned getAtomicMemIntrinsicMaxElementSize() const;

  /// \returns A value which is the result of the given memory intrinsic. If
  /// \p CanCreate is true, new instructions may be created to extract the
  /// result from the given intrinsic memory operation. Returns nullptr if
  /// the target cannot create a result from the given intrinsic.
  LLVM_ABI Value *
  getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType,
                                    bool CanCreate = true) const;

  /// \returns The type to use in a loop expansion of a memcpy call.
  LLVM_ABI Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize = std::nullopt) const;
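
  // Illustrative sketch only: a memcpy-expansion client might pick the loop
  // operand type before computing the residual types below. Assumes a
  // `TargetTransformInfo &TTI`, an `LLVMContext &Ctx`, and a `Value *Length`
  // in scope.
  //
  //   Type *OpTy = TTI.getMemcpyLoopLoweringType(
  //       Ctx, Length, /*SrcAddrSpace=*/0, /*DestAddrSpace=*/0,
  //       Align(4), Align(4));
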
  /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
  /// \param RemainingBytes The number of bytes to copy.
  ///
  /// Calculates the operand types to use when copying \p RemainingBytes of
  /// memory, where the source and destination alignments are \p SrcAlign and
  /// \p DestAlign respectively.
  LLVM_ABI void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize = std::nullopt) const;

  /// \returns True if the two functions have compatible attributes for
  /// inlining purposes.
  LLVM_ABI bool areInlineCompatible(const Function *Caller,
                                    const Function *Callee) const;

  /// Returns a penalty for invoking call \p Call in \p F.
  /// For example, if a function F calls a function G, which in turn calls
  /// function H, then getInlineCallPenalty(F, H()) would return the
  /// penalty of calling H from F, e.g. after inlining G into F.
  /// \p DefaultCallPenalty is passed to give a default penalty that
  /// the target can amend or override.
  LLVM_ABI unsigned getInlineCallPenalty(const Function *F,
                                         const CallBase &Call,
                                         unsigned DefaultCallPenalty) const;

  /// \returns True if the caller and callee agree on how \p Types will be
  /// passed to or returned from the callee.
  /// \param Types List of types to check.
  LLVM_ABI bool areTypesABICompatible(const Function *Caller,
                                      const Function *Callee,
                                      const ArrayRef<Type *> &Types) const;

  /// The type of load/store indexing.
  enum MemIndexedMode {
    MIM_Unindexed, ///< No indexing.
    MIM_PreInc,    ///< Pre-incrementing.
    MIM_PreDec,    ///< Pre-decrementing.
    MIM_PostInc,   ///< Post-incrementing.
    MIM_PostDec    ///< Post-decrementing.
  };

  /// \returns True if the specified indexed load for the given type is legal.
  LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;

  /// \returns True if the specified indexed store for the given type is
  /// legal.
  LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;

  /// \returns The bitwidth of the largest vector type that should be used to
  /// load/store in the given address space.
  LLVM_ABI unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  /// \returns True if the load instruction is legal to vectorize.
  LLVM_ABI bool isLegalToVectorizeLoad(LoadInst *LI) const;

  /// \returns True if the store instruction is legal to vectorize.
  LLVM_ABI bool isLegalToVectorizeStore(StoreInst *SI) const;

  /// \returns True if it is legal to vectorize the given load chain.
  LLVM_ABI bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const;

  /// \returns True if it is legal to vectorize the given store chain.
  LLVM_ABI bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const;

  /// \returns True if it is legal to vectorize the given reduction kind.
  LLVM_ABI bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
                                            ElementCount VF) const;

  /// \returns True if the given type is supported for scalable vectors.
  LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const;
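
  // Illustrative sketch only: a load/store-vectorizer style client might
  // check chain legality before widening. Assumes `TTI`, `Bytes`,
  // `Alignment`, and `AS` in scope.
  //
  //   if (TTI.isLegalToVectorizeLoadChain(Bytes, Alignment, AS)) {
  //     // ... widen the chain of adjacent loads into one vector load.
  //   }
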
  /// \returns The new vector factor value if the target doesn't support \p
  /// SizeInBytes loads or has a better vector factor.
  LLVM_ABI unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                        unsigned ChainSizeInBytes,
                                        VectorType *VecTy) const;

  /// \returns The new vector factor value if the target doesn't support \p
  /// SizeInBytes stores or has a better vector factor.
  LLVM_ABI unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const;

  /// \returns True if the target prefers fixed-width vectorization when the
  /// loop vectorizer's cost model assigns an equal cost to the fixed and
  /// scalable versions of the vectorized loop.
  LLVM_ABI bool preferFixedOverScalableIfEqualCost() const;

  /// \returns True if the target prefers the SLP vectorizer with alternate
  /// opcode vectorization, false otherwise.
  LLVM_ABI bool preferAlternateOpcodeVectorization() const;

  /// \returns True if the target prefers reductions of \p Kind to be
  /// performed in the loop.
  LLVM_ABI bool preferInLoopReduction(RecurKind Kind, Type *Ty) const;

  /// \returns True if the target prefers the reduction select to be kept in
  /// the loop when tail folding, i.e.
  ///   loop:
  ///     p = phi (0, s)
  ///     a = add (p, x)
  ///     s = select (mask, a, p)
  ///   vecreduce.add(s)
  ///
  /// As opposed to the normal scheme of p = phi (0, a), which allows the
  /// select to be pulled out of the loop. If the select(.., add, ..) can be
  /// predicated by the target, this can lead to cleaner code generation.
  LLVM_ABI bool preferPredicatedReductionSelect() const;

  /// Return true if the loop vectorizer should consider vectorizing an
  /// otherwise scalar epilogue loop.
  LLVM_ABI bool preferEpilogueVectorization() const;

  /// \returns True if the target wants to expand the given reduction
  /// intrinsic into a shuffle sequence.
  LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;

  enum struct ReductionShuffle { SplitHalf, Pairwise };

  /// \returns The shuffle sequence pattern used to expand the given reduction
  /// intrinsic.
  LLVM_ABI ReductionShuffle
  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const;

  /// \returns the size cost of rematerializing a GlobalValue address relative
  /// to a stack reload.
  LLVM_ABI unsigned getGISelRematGlobalCost() const;

  /// \returns the lower bound of a trip count to decide on vectorization
  /// while tail-folding.
  LLVM_ABI unsigned getMinTripCountTailFoldingThreshold() const;

  /// \returns True if the target supports scalable vectors.
  LLVM_ABI bool supportsScalableVectors() const;

  /// \return true when scalable vectorization is preferred.
  LLVM_ABI bool enableScalableVectorization() const;

  /// \name Vector Predication Information
  /// @{
  /// Whether the target supports the %evl parameter of VP intrinsics
  /// efficiently in hardware (see LLVM Language Reference - "Vector
  /// Predication Intrinsics"). Use of %evl is discouraged when that is not
  /// the case.
  LLVM_ABI bool hasActiveVectorLength() const;
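
  // Illustrative sketch only: choosing between the two expansion shapes for
  // a reduction intrinsic queried above. Assumes a `TargetTransformInfo
  // &TTI` and an `IntrinsicInst *II` in scope.
  //
  //   if (TTI.shouldExpandReduction(II) &&
  //       TTI.getPreferredExpandedReductionShuffle(II) ==
  //           TargetTransformInfo::ReductionShuffle::SplitHalf) {
  //     // ... emit the log2(N) split-in-half shuffle sequence.
  //   }
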
  /// Return true if sinking I's operands to the same basic block as I is
  /// profitable, e.g. because the operands can be folded into a target
  /// instruction during instruction selection. After calling the function
  /// \p Ops contains the Uses to sink ordered by dominance (dominating users
  /// come first).
  LLVM_ABI bool isProfitableToSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const;

  /// Return true if it's significantly cheaper to shift a vector by a uniform
  /// scalar than by an amount which will vary across each lane. On x86 before
  /// AVX2 for example, there is a "psllw" instruction for the former case,
  /// but no simple instruction for a general "a << b" operation on vectors.
  /// This should also apply to lowering for vector funnel shifts (rotates).
  LLVM_ABI bool isVectorShiftByScalarCheap(Type *Ty) const;

  struct VPLegalization {
    enum VPTransform {
      // Keep the predicating parameter.
      Legal = 0,
      // Where legal, discard the predicate parameter.
      Discard = 1,
      // Transform into something else that is also predicating.
      Convert = 2
    };

    // How to transform the EVL parameter.
    // Legal:   keep the EVL parameter as it is.
    // Discard: ignore the EVL parameter where it is safe to do so.
    // Convert: fold the EVL into the mask parameter.
    VPTransform EVLParamStrategy;

    // How to transform the operator.
    // Legal:   the target supports this operator.
    // Convert: convert this to a non-VP operation.
    // The 'Discard' strategy is invalid.
    VPTransform OpStrategy;

    bool shouldDoNothing() const {
      return (EVLParamStrategy == Legal) && (OpStrategy == Legal);
    }
    VPLegalization(VPTransform EVLParamStrategy, VPTransform OpStrategy)
        : EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {}
  };

  /// \returns How the target needs this vector-predicated operation to be
  /// transformed.
  LLVM_ABI VPLegalization
  getVPLegalizationStrategy(const VPIntrinsic &PI) const;
  /// @}

  /// \returns Whether a 32-bit branch instruction is available in Arm or
  /// Thumb state.
  ///
  /// Used by the LowerTypeTests pass, which constructs an IR inline assembler
  /// node containing a jump table in a format suitable for the target, so it
  /// needs to know what format of jump table it can legally use.
  ///
  /// For non-Arm targets, this function isn't used. It defaults to returning
  /// false, but it shouldn't matter what it returns anyway.
  LLVM_ABI bool hasArmWideBranch(bool Thumb) const;

  /// Returns a bitmask constructed from the target-features or fmv-features
  /// metadata of a function.
  LLVM_ABI uint64_t getFeatureMask(const Function &F) const;

  /// Returns true if this is an instance of a function with multiple
  /// versions.
  LLVM_ABI bool isMultiversionedFunction(const Function &F) const;

  /// \return The maximum number of function arguments the target supports.
  LLVM_ABI unsigned getMaxNumArgs() const;

  /// \return For an array of the given \p Size, the alignment boundary to
  /// pad to. Default is no padding.
  LLVM_ABI unsigned getNumBytesToPadGlobalArray(unsigned Size,
                                                Type *ArrayType) const;

  /// @}

  /// Collect kernel launch bounds for \p F into \p LB.
  LLVM_ABI void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;

private:
  std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
};
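
// Illustrative sketch only: a VP-expansion style client first asks the
// target how to legalize a vector-predicated intrinsic. Assumes a
// `TargetTransformInfo &TTI` and a `VPIntrinsic &VPI` in scope.
//
//   TargetTransformInfo::VPLegalization Strategy =
//       TTI.getVPLegalizationStrategy(VPI);
//   if (Strategy.shouldDoNothing())
//     return; // the operation is natively supported
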
/// Analysis pass providing the \c TargetTransformInfo.
///
/// The core idea of the TargetIRAnalysis is to expose an interface through
/// which LLVM targets can analyze and provide information about the middle
/// end's target-independent IR. This supports use cases such as target-aware
/// cost modeling of IR constructs.
///
/// This is a function analysis because much of the cost modeling for targets
/// is done in a subtarget-specific way and LLVM supports compiling different
/// functions targeting different subtargets in order to support runtime
/// dispatch according to the observed subtarget.
class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
public:
  typedef TargetTransformInfo Result;

  /// Default construct a target IR analysis.
  ///
  /// This will use the module's datalayout to construct a baseline
  /// conservative TTI result.
  LLVM_ABI TargetIRAnalysis();

  /// Construct an IR analysis pass around a target-provided callback.
  ///
  /// The callback will be called with a particular function for which the
  /// TTI is needed and must return a TTI object for that function.
  LLVM_ABI
  TargetIRAnalysis(std::function<Result(const Function &)> TTICallback);

  // Value semantics. We spell out the constructors for MSVC.
  TargetIRAnalysis(const TargetIRAnalysis &Arg)
      : TTICallback(Arg.TTICallback) {}
  TargetIRAnalysis(TargetIRAnalysis &&Arg)
      : TTICallback(std::move(Arg.TTICallback)) {}
  TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
    TTICallback = RHS.TTICallback;
    return *this;
  }
  TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
    TTICallback = std::move(RHS.TTICallback);
    return *this;
  }

  LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &);

private:
  friend AnalysisInfoMixin<TargetIRAnalysis>;
  LLVM_ABI static AnalysisKey Key;

  /// The callback used to produce a result.
  ///
  /// We use a completely opaque callback so that targets can provide
  /// whatever mechanism they desire for constructing the TTI for a given
  /// function.
  ///
  /// FIXME: Should we really use std::function? It's relatively inefficient.
  /// It might be possible to arrange for even stateful callbacks to outlive
  /// the analysis and thus use a function_ref which would be lighter weight.
  /// This may also be less error prone as the callback is likely to
  /// reference the external TargetMachine, and that reference needs to never
  /// dangle.
  std::function<Result(const Function &)> TTICallback;

  /// Helper function used as the callback in the default constructor.
  static Result getDefaultTTI(const Function &F);
};
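
// Illustrative sketch only: under the new pass manager, clients obtain the
// per-function TTI result through a FunctionAnalysisManager. Assumes a
// `FunctionAnalysisManager &FAM` with a registered TargetIRAnalysis and a
// `Function &F` in scope.
//
//   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
//   if (TTI.supportsScalableVectors()) {
//     // ... consider scalable VFs.
//   }
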
/// Wrapper pass for TargetTransformInfo.
///
/// This pass can be constructed from a TTI object which it stores internally
/// and is queried by passes.
class LLVM_ABI TargetTransformInfoWrapperPass : public ImmutablePass {
  TargetIRAnalysis TIRA;
  std::optional<TargetTransformInfo> TTI;

  virtual void anchor();

public:
  static char ID;

  /// We must provide a default constructor for the pass but it should
  /// never be used.
  ///
  /// Use the constructor below or call one of the creation routines.
  TargetTransformInfoWrapperPass();

  explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);

  TargetTransformInfo &getTTI(const Function &F);
};

/// Create an analysis pass wrapper around a TTI object.
///
/// This analysis pass just holds the TTI instance and makes it available to
/// clients.
LLVM_ABI ImmutablePass *
createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);

} // namespace llvm

#endif