//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }
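
  // thisT() is the CRTP hook: cost queries below are issued as
  // thisT()->getVectorInstrCost(...) and friends so that a derived target
  // implementation which overrides a hook is preferred over the generic
  // modelling in this class. A minimal subclass (names here are purely
  // illustrative, not an existing target) would look roughly like:
  //
  //   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
  //     const MyTargetSubtarget *ST;
  //     const MyTargetLowering *TLI;
  //     const MyTargetSubtarget *getST() const { return ST; }
  //     const MyTargetLowering *getTLI() const { return TLI; }
  //     // ...overridden TTI hooks...
  //   };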
  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zeroth element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                        CostKind, 0, nullptr, nullptr);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
                                            TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting each element from its
    // source argument plus the cost of inserting it into the result vector.

    // e.g. a <4 x float> shuffle with a mask of <0,5,2,7> extracts index 0 of
    // the first vector, index 1 of the second vector, index 2 of the first
    // vector and finally index 3 of the second vector, and inserts them at
    // indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
                                              TTI::TargetCostKind CostKind,
                                              int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting each
    // element from the source vector plus the cost of inserting it into the
    // result subvector.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                      CostKind, i + Index, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }
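
  // For illustration only (not an exact target cost): extracting the high
  // <2 x float> half (Index = 2) of a <4 x float> source is modelled above as
  // two extractelements from the source (indices 2 and 3) plus two
  // insertelements into the result (indices 0 and 1), i.e. roughly:
  //
  //   %e2 = extractelement <4 x float> %src, i32 2
  //   %e3 = extractelement <4 x float> %src, i32 3
  //   %r0 = insertelement <2 x float> poison, float %e2, i32 0
  //   %r1 = insertelement <2 x float> %r0, float %e3, i32 1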
  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
                                             TTI::TargetCostKind CostKind,
                                             int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting each
    // element from the subvector plus the cost of inserting it into the
    // result vector.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
                                      i + Index, nullptr, nullptr);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }
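
  // The helper below is a deliberately rough, scalarized model. As an
  // illustrative example (not a guaranteed target cost): a masked load of
  // <4 x i32> with a variable mask is costed as 4 scalar i32 loads, plus the
  // overhead of inserting the 4 results into a vector, plus extracting the
  // 4 mask bits, plus 4 * (branch + PHI) for the per-lane conditional
  // execution; a gather additionally pays for extracting the 4 pointers.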
  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind,
                                              unsigned AddressSpace = 0) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    unsigned VF = VT->getNumElements();

    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getScalarizationOverhead(
                  FixedVectorType::get(
                      PointerType::get(VT->getElementType(), 0), VF),
                  /*Insert=*/false, /*Extract=*/true, CostKind)
            : 0;

    // The cost of the scalar loads/stores.
    InstructionCost MemoryOpCost =
        VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
                                      AddressSpace, CostKind);

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // plus the branches and PHIs needed to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          getScalarizationOverhead(
              FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
              /*Insert=*/false, /*Extract=*/true, CostKind) +
          VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
                thisT()->getCFInstrCost(Instruction::PHI, CostKind));
    }

    return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
297 return -1; 298 } 299 collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID)300 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 301 Intrinsic::ID IID) const { 302 return false; 303 } 304 isNoopAddrSpaceCast(unsigned FromAS,unsigned ToAS)305 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 306 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 307 } 308 getAssumedAddrSpace(const Value * V)309 unsigned getAssumedAddrSpace(const Value *V) const { 310 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 311 } 312 isSingleThreaded()313 bool isSingleThreaded() const { 314 return getTLI()->getTargetMachine().Options.ThreadModel == 315 ThreadModel::Single; 316 } 317 318 std::pair<const Value *, unsigned> getPredicatedAddrSpace(const Value * V)319 getPredicatedAddrSpace(const Value *V) const { 320 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); 321 } 322 rewriteIntrinsicWithAddressSpace(IntrinsicInst * II,Value * OldV,Value * NewV)323 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 324 Value *NewV) const { 325 return nullptr; 326 } 327 isLegalAddImmediate(int64_t imm)328 bool isLegalAddImmediate(int64_t imm) { 329 return getTLI()->isLegalAddImmediate(imm); 330 } 331 isLegalAddScalableImmediate(int64_t Imm)332 bool isLegalAddScalableImmediate(int64_t Imm) { 333 return getTLI()->isLegalAddScalableImmediate(Imm); 334 } 335 isLegalICmpImmediate(int64_t imm)336 bool isLegalICmpImmediate(int64_t imm) { 337 return getTLI()->isLegalICmpImmediate(imm); 338 } 339 340 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 341 bool HasBaseReg, int64_t Scale, unsigned AddrSpace, 342 Instruction *I = nullptr, 343 int64_t ScalableOffset = 0) { 344 TargetLoweringBase::AddrMode AM; 345 AM.BaseGV = BaseGV; 346 AM.BaseOffs = BaseOffset; 347 AM.HasBaseReg = HasBaseReg; 348 AM.Scale = Scale; 349 AM.ScalableOffset = ScalableOffset; 350 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 351 } 352 getPreferredLargeGEPBaseOffset(int64_t MinOffset,int64_t MaxOffset)353 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) { 354 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset); 355 } 356 getStoreMinimumVF(unsigned VF,Type * ScalarMemTy,Type * ScalarValTy)357 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, 358 Type *ScalarValTy) const { 359 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { 360 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); 361 EVT VT = getTLI()->getValueType(DL, SrcTy); 362 if (getTLI()->isOperationLegal(ISD::STORE, VT) || 363 getTLI()->isOperationCustom(ISD::STORE, VT)) 364 return true; 365 366 EVT ValVT = 367 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); 368 EVT LegalizedVT = 369 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); 370 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); 371 }; 372 while (VF > 2 && IsSupportedByTarget(VF)) 373 VF /= 2; 374 return VF; 375 } 376 isIndexedLoadLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)377 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 378 const DataLayout &DL) const { 379 EVT VT = getTLI()->getValueType(DL, Ty); 380 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 381 } 382 isIndexedStoreLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)383 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 384 const DataLayout 
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldFoldTerminatingConditionAfterLSR() const {
    return TargetTransformInfoImplBase::
        shouldFoldTerminatingConditionAfterLSR();
  }

  bool shouldDropLSRSolutionIfLessProfitable() const {
    return TargetTransformInfoImplBase::shouldDropLSRSolutionIfLessProfitable();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset.getFixed();
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = BaseOffset.getScalable();
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }
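
  // As a rough illustration of the clustering estimate implemented below
  // (actual lowering may differ): a dense i32 switch over cases 0..9 is
  // typically counted as a single cluster (one jump table covering a range of
  // 10), a switch whose case count fits within the index width of the default
  // address space may count as a single bit-test cluster, and a sparse switch
  // such as cases {1, 1000, 1000000} is counted as one cluster per case.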
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of switch in
    /// inline cost heuristic, but it's a generic cost model to be used in other
    /// places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
554 const TargetLoweringBase *TLI = getTLI(); 555 EVT VT = TLI->getValueType(DL, Ty); 556 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) 557 return TargetTransformInfo::TCC_Basic; 558 return TargetTransformInfo::TCC_Expensive; 559 } 560 preferToKeepConstantsAttached(const Instruction & Inst,const Function & Fn)561 bool preferToKeepConstantsAttached(const Instruction &Inst, 562 const Function &Fn) const { 563 switch (Inst.getOpcode()) { 564 default: 565 break; 566 case Instruction::SDiv: 567 case Instruction::SRem: 568 case Instruction::UDiv: 569 case Instruction::URem: { 570 if (!isa<ConstantInt>(Inst.getOperand(1))) 571 return false; 572 EVT VT = getTLI()->getValueType(DL, Inst.getType()); 573 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes()); 574 } 575 }; 576 577 return false; 578 } 579 getInliningThresholdMultiplier()580 unsigned getInliningThresholdMultiplier() const { return 1; } adjustInliningThreshold(const CallBase * CB)581 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } getCallerAllocaCost(const CallBase * CB,const AllocaInst * AI)582 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const { 583 return 0; 584 } 585 getInlinerVectorBonusPercent()586 int getInlinerVectorBonusPercent() const { return 150; } 587 getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE)588 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 589 TTI::UnrollingPreferences &UP, 590 OptimizationRemarkEmitter *ORE) { 591 // This unrolling functionality is target independent, but to provide some 592 // motivation for its intended use, for x86: 593 594 // According to the Intel 64 and IA-32 Architectures Optimization Reference 595 // Manual, Intel Core models and later have a loop stream detector (and 596 // associated uop queue) that can benefit from partial unrolling. 597 // The relevant requirements are: 598 // - The loop must have no more than 4 (8 for Nehalem and later) branches 599 // taken, and none of them may be calls. 600 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 601 602 // According to the Software Optimization Guide for AMD Family 15h 603 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 604 // and loop buffer which can benefit from partial unrolling. 605 // The relevant requirements are: 606 // - The loop must have fewer than 16 branches 607 // - The loop must have less than 40 uops in all executed loop branches 608 609 // The number of taken branches in a loop is hard to estimate here, and 610 // benchmarking has revealed that it is better not to be conservative when 611 // estimating the branch count. As a result, we'll ignore the branch limits 612 // until someone finds a case where it matters in practice. 613 614 unsigned MaxOps; 615 const TargetSubtargetInfo *ST = getST(); 616 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 617 MaxOps = PartialUnrollingThreshold; 618 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 619 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 620 else 621 return; 622 623 // Scan the loop: don't unroll loops with calls. 
624 for (BasicBlock *BB : L->blocks()) { 625 for (Instruction &I : *BB) { 626 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 627 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 628 if (!thisT()->isLoweredToCall(F)) 629 continue; 630 } 631 632 if (ORE) { 633 ORE->emit([&]() { 634 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), 635 L->getHeader()) 636 << "advising against unrolling the loop because it " 637 "contains a " 638 << ore::NV("Call", &I); 639 }); 640 } 641 return; 642 } 643 } 644 } 645 646 // Enable runtime and partial unrolling up to the specified size. 647 // Enable using trip count upper bound to unroll loops. 648 UP.Partial = UP.Runtime = UP.UpperBound = true; 649 UP.PartialThreshold = MaxOps; 650 651 // Avoid unrolling when optimizing for size. 652 UP.OptSizeThreshold = 0; 653 UP.PartialOptSizeThreshold = 0; 654 655 // Set number of instructions optimized when "back edge" 656 // becomes "fall through" to default value of 2. 657 UP.BEInsns = 2; 658 } 659 getPeelingPreferences(Loop * L,ScalarEvolution & SE,TTI::PeelingPreferences & PP)660 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 661 TTI::PeelingPreferences &PP) { 662 PP.PeelCount = 0; 663 PP.AllowPeeling = true; 664 PP.AllowLoopNestsPeeling = false; 665 PP.PeelProfiledIterations = true; 666 } 667 isHardwareLoopProfitable(Loop * L,ScalarEvolution & SE,AssumptionCache & AC,TargetLibraryInfo * LibInfo,HardwareLoopInfo & HWLoopInfo)668 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 669 AssumptionCache &AC, 670 TargetLibraryInfo *LibInfo, 671 HardwareLoopInfo &HWLoopInfo) { 672 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 673 } 674 preferPredicateOverEpilogue(TailFoldingInfo * TFI)675 bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { 676 return BaseT::preferPredicateOverEpilogue(TFI); 677 } 678 679 TailFoldingStyle 680 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) { 681 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow); 682 } 683 instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II)684 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 685 IntrinsicInst &II) { 686 return BaseT::instCombineIntrinsic(IC, II); 687 } 688 689 std::optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed)690 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, 691 APInt DemandedMask, KnownBits &Known, 692 bool &KnownBitsComputed) { 693 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 694 KnownBitsComputed); 695 } 696 simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> SimplifyAndSetOp)697 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( 698 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 699 APInt &UndefElts2, APInt &UndefElts3, 700 std::function<void(Instruction *, unsigned, APInt, APInt &)> 701 SimplifyAndSetOp) { 702 return BaseT::simplifyDemandedVectorEltsIntrinsic( 703 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 704 SimplifyAndSetOp); 705 } 706 707 virtual std::optional<unsigned> getCacheSize(TargetTransformInfo::CacheLevel Level)708 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 709 return std::optional<unsigned>( 710 
getST()->getCacheSize(static_cast<unsigned>(Level))); 711 } 712 713 virtual std::optional<unsigned> getCacheAssociativity(TargetTransformInfo::CacheLevel Level)714 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 715 std::optional<unsigned> TargetResult = 716 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 717 718 if (TargetResult) 719 return TargetResult; 720 721 return BaseT::getCacheAssociativity(Level); 722 } 723 getCacheLineSize()724 virtual unsigned getCacheLineSize() const { 725 return getST()->getCacheLineSize(); 726 } 727 getPrefetchDistance()728 virtual unsigned getPrefetchDistance() const { 729 return getST()->getPrefetchDistance(); 730 } 731 getMinPrefetchStride(unsigned NumMemAccesses,unsigned NumStridedMemAccesses,unsigned NumPrefetches,bool HasCall)732 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 733 unsigned NumStridedMemAccesses, 734 unsigned NumPrefetches, 735 bool HasCall) const { 736 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 737 NumPrefetches, HasCall); 738 } 739 getMaxPrefetchIterationsAhead()740 virtual unsigned getMaxPrefetchIterationsAhead() const { 741 return getST()->getMaxPrefetchIterationsAhead(); 742 } 743 enableWritePrefetching()744 virtual bool enableWritePrefetching() const { 745 return getST()->enableWritePrefetching(); 746 } 747 shouldPrefetchAddressSpace(unsigned AS)748 virtual bool shouldPrefetchAddressSpace(unsigned AS) const { 749 return getST()->shouldPrefetchAddressSpace(AS); 750 } 751 752 /// @} 753 754 /// \name Vector TTI Implementations 755 /// @{ 756 getRegisterBitWidth(TargetTransformInfo::RegisterKind K)757 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 758 return TypeSize::getFixed(32); 759 } 760 getMaxVScale()761 std::optional<unsigned> getMaxVScale() const { return std::nullopt; } getVScaleForTuning()762 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; } isVScaleKnownToBeAPowerOfTwo()763 bool isVScaleKnownToBeAPowerOfTwo() const { return false; } 764 765 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 766 /// are set if the demanded result elements need to be inserted and/or 767 /// extracted from vectors. getScalarizationOverhead(VectorType * InTy,const APInt & DemandedElts,bool Insert,bool Extract,TTI::TargetCostKind CostKind)768 InstructionCost getScalarizationOverhead(VectorType *InTy, 769 const APInt &DemandedElts, 770 bool Insert, bool Extract, 771 TTI::TargetCostKind CostKind) { 772 /// FIXME: a bitfield is not a reasonable abstraction for talking about 773 /// which elements are needed from a scalable vector 774 if (isa<ScalableVectorType>(InTy)) 775 return InstructionCost::getInvalid(); 776 auto *Ty = cast<FixedVectorType>(InTy); 777 778 assert(DemandedElts.getBitWidth() == Ty->getNumElements() && 779 "Vector size mismatch"); 780 781 InstructionCost Cost = 0; 782 783 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { 784 if (!DemandedElts[i]) 785 continue; 786 if (Insert) 787 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, 788 CostKind, i, nullptr, nullptr); 789 if (Extract) 790 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 791 CostKind, i, nullptr, nullptr); 792 } 793 794 return Cost; 795 } 796 797 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract,
                                           TTI::TargetCostKind CostKind) {
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                             CostKind);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  InstructionCost
  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                   ArrayRef<Type *> Tys,
                                   TTI::TargetCostKind CostKind) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys,
                                           TTI::TargetCostKind CostKind) {
    InstructionCost Cost = getScalarizationOverhead(
        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
                                       /*Extract*/ true, CostKind);

    return Cost;
  }

  /// Estimate the cost of type-legalization and the legalized type.
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;
    // We keep legalizing the type until we find a legal kind. We assume that
    // the only operation that costs anything is the split. After splitting
    // we need to handle two types.
    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Ensure we return a sensible simple VT here, since many callers of
        // this function require it.
        MVT VT = MTy.isSimple() ?
MTy.getSimpleVT() : MVT::i64; 876 return std::make_pair(InstructionCost::getInvalid(), VT); 877 } 878 879 if (LK.first == TargetLoweringBase::TypeLegal) 880 return std::make_pair(Cost, MTy.getSimpleVT()); 881 882 if (LK.first == TargetLoweringBase::TypeSplitVector || 883 LK.first == TargetLoweringBase::TypeExpandInteger) 884 Cost *= 2; 885 886 // Do not loop with f128 type. 887 if (MTy == LK.second) 888 return std::make_pair(Cost, MTy.getSimpleVT()); 889 890 // Keep legalizing the type. 891 MTy = LK.second; 892 } 893 } 894 getMaxInterleaveFactor(ElementCount VF)895 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } 896 897 InstructionCost getArithmeticInstrCost( 898 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 899 TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None}, 900 TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None}, 901 ArrayRef<const Value *> Args = std::nullopt, 902 const Instruction *CxtI = nullptr) { 903 // Check if any of the operands are vector operands. 904 const TargetLoweringBase *TLI = getTLI(); 905 int ISD = TLI->InstructionOpcodeToISD(Opcode); 906 assert(ISD && "Invalid opcode"); 907 908 // TODO: Handle more cost kinds. 909 if (CostKind != TTI::TCK_RecipThroughput) 910 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 911 Opd1Info, Opd2Info, 912 Args, CxtI); 913 914 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 915 916 bool IsFloat = Ty->isFPOrFPVectorTy(); 917 // Assume that floating point arithmetic operations cost twice as much as 918 // integer operations. 919 InstructionCost OpCost = (IsFloat ? 2 : 1); 920 921 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 922 // The operation is legal. Assume it costs 1. 923 // TODO: Once we have extract/insert subvector cost we need to use them. 924 return LT.first * OpCost; 925 } 926 927 if (!TLI->isOperationExpand(ISD, LT.second)) { 928 // If the operation is custom lowered, then assume that the code is twice 929 // as expensive. 930 return LT.first * 2 * OpCost; 931 } 932 933 // An 'Expand' of URem and SRem is special because it may default 934 // to expanding the operation into a sequence of sub-operations 935 // i.e. X % Y -> X-(X/Y)*Y. 936 if (ISD == ISD::UREM || ISD == ISD::SREM) { 937 bool IsSigned = ISD == ISD::SREM; 938 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, 939 LT.second) || 940 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, 941 LT.second)) { 942 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; 943 InstructionCost DivCost = thisT()->getArithmeticInstrCost( 944 DivOpc, Ty, CostKind, Opd1Info, Opd2Info); 945 InstructionCost MulCost = 946 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); 947 InstructionCost SubCost = 948 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 949 return DivCost + MulCost + SubCost; 950 } 951 } 952 953 // We cannot scalarize scalable vectors, so return Invalid. 954 if (isa<ScalableVectorType>(Ty)) 955 return InstructionCost::getInvalid(); 956 957 // Else, assume that we need to scalarize this op. 958 // TODO: If one of the types get legalized by splitting, handle this 959 // similarly to what getCastInstrCost() does. 
960 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 961 InstructionCost Cost = thisT()->getArithmeticInstrCost( 962 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 963 Args, CxtI); 964 // Return the cost of multiple scalar invocation plus the cost of 965 // inserting and extracting the values. 966 SmallVector<Type *> Tys(Args.size(), Ty); 967 return getScalarizationOverhead(VTy, Args, Tys, CostKind) + 968 VTy->getNumElements() * Cost; 969 } 970 971 // We don't know anything about this scalar instruction. 972 return OpCost; 973 } 974 improveShuffleKindFromMask(TTI::ShuffleKind Kind,ArrayRef<int> Mask,VectorType * Ty,int & Index,VectorType * & SubTy)975 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 976 ArrayRef<int> Mask, 977 VectorType *Ty, int &Index, 978 VectorType *&SubTy) const { 979 if (Mask.empty()) 980 return Kind; 981 int NumSrcElts = Ty->getElementCount().getKnownMinValue(); 982 switch (Kind) { 983 case TTI::SK_PermuteSingleSrc: 984 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) 985 return TTI::SK_Reverse; 986 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) 987 return TTI::SK_Broadcast; 988 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && 989 (Index + Mask.size()) <= (size_t)NumSrcElts) { 990 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size()); 991 return TTI::SK_ExtractSubvector; 992 } 993 break; 994 case TTI::SK_PermuteTwoSrc: { 995 int NumSubElts; 996 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( 997 Mask, NumSrcElts, NumSubElts, Index)) { 998 if (Index + NumSubElts > NumSrcElts) 999 return Kind; 1000 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts); 1001 return TTI::SK_InsertSubvector; 1002 } 1003 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts)) 1004 return TTI::SK_Select; 1005 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts)) 1006 return TTI::SK_Transpose; 1007 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index)) 1008 return TTI::SK_Splice; 1009 break; 1010 } 1011 case TTI::SK_Select: 1012 case TTI::SK_Reverse: 1013 case TTI::SK_Broadcast: 1014 case TTI::SK_Transpose: 1015 case TTI::SK_InsertSubvector: 1016 case TTI::SK_ExtractSubvector: 1017 case TTI::SK_Splice: 1018 break; 1019 } 1020 return Kind; 1021 } 1022 1023 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 1024 ArrayRef<int> Mask, 1025 TTI::TargetCostKind CostKind, int Index, 1026 VectorType *SubTp, 1027 ArrayRef<const Value *> Args = std::nullopt, 1028 const Instruction *CxtI = nullptr) { 1029 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) { 1030 case TTI::SK_Broadcast: 1031 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1032 return getBroadcastShuffleOverhead(FVT, CostKind); 1033 return InstructionCost::getInvalid(); 1034 case TTI::SK_Select: 1035 case TTI::SK_Splice: 1036 case TTI::SK_Reverse: 1037 case TTI::SK_Transpose: 1038 case TTI::SK_PermuteSingleSrc: 1039 case TTI::SK_PermuteTwoSrc: 1040 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1041 return getPermuteShuffleOverhead(FVT, CostKind); 1042 return InstructionCost::getInvalid(); 1043 case TTI::SK_ExtractSubvector: 1044 return getExtractSubvectorOverhead(Tp, CostKind, Index, 1045 cast<FixedVectorType>(SubTp)); 1046 case TTI::SK_InsertSubvector: 1047 return getInsertSubvectorOverhead(Tp, CostKind, Index, 1048 cast<FixedVectorType>(SubTp)); 1049 } 1050 llvm_unreachable("Unknown TTI::ShuffleKind"); 1051 } 1052 1053 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, 
Type *Src, 1054 TTI::CastContextHint CCH, 1055 TTI::TargetCostKind CostKind, 1056 const Instruction *I = nullptr) { 1057 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 1058 return 0; 1059 1060 const TargetLoweringBase *TLI = getTLI(); 1061 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1062 assert(ISD && "Invalid opcode"); 1063 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 1064 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst); 1065 1066 TypeSize SrcSize = SrcLT.second.getSizeInBits(); 1067 TypeSize DstSize = DstLT.second.getSizeInBits(); 1068 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 1069 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 1070 1071 switch (Opcode) { 1072 default: 1073 break; 1074 case Instruction::Trunc: 1075 // Check for NOOP conversions. 1076 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 1077 return 0; 1078 [[fallthrough]]; 1079 case Instruction::BitCast: 1080 // Bitcast between types that are legalized to the same type are free and 1081 // assume int to/from ptr of the same size is also free. 1082 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 1083 SrcSize == DstSize) 1084 return 0; 1085 break; 1086 case Instruction::FPExt: 1087 if (I && getTLI()->isExtFree(I)) 1088 return 0; 1089 break; 1090 case Instruction::ZExt: 1091 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 1092 return 0; 1093 [[fallthrough]]; 1094 case Instruction::SExt: 1095 if (I && getTLI()->isExtFree(I)) 1096 return 0; 1097 1098 // If this is a zext/sext of a load, return 0 if the corresponding 1099 // extending load exists on target and the result type is legal. 1100 if (CCH == TTI::CastContextHint::Normal) { 1101 EVT ExtVT = EVT::getEVT(Dst); 1102 EVT LoadVT = EVT::getEVT(Src); 1103 unsigned LType = 1104 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); 1105 if (DstLT.first == SrcLT.first && 1106 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 1107 return 0; 1108 } 1109 break; 1110 case Instruction::AddrSpaceCast: 1111 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 1112 Dst->getPointerAddressSpace())) 1113 return 0; 1114 break; 1115 } 1116 1117 auto *SrcVTy = dyn_cast<VectorType>(Src); 1118 auto *DstVTy = dyn_cast<VectorType>(Dst); 1119 1120 // If the cast is marked as legal (or promote) then assume low cost. 1121 if (SrcLT.first == DstLT.first && 1122 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 1123 return SrcLT.first; 1124 1125 // Handle scalar conversions. 1126 if (!SrcVTy && !DstVTy) { 1127 // Just check the op cost. If the operation is legal then assume it costs 1128 // 1. 1129 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1130 return 1; 1131 1132 // Assume that illegal scalar instruction are expensive. 1133 return 4; 1134 } 1135 1136 // Check vector-to-vector casts. 1137 if (DstVTy && SrcVTy) { 1138 // If the cast is between same-sized registers, then the check is simple. 1139 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 1140 1141 // Assume that Zext is done using AND. 1142 if (Opcode == Instruction::ZExt) 1143 return SrcLT.first; 1144 1145 // Assume that sext is done using SHL and SRA. 1146 if (Opcode == Instruction::SExt) 1147 return SrcLT.first * 2; 1148 1149 // Just check the op cost. If the operation is legal then assume it 1150 // costs 1151 // 1 and multiply by the type-legalization overhead. 
1152 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1153 return SrcLT.first * 1; 1154 } 1155 1156 // If we are legalizing by splitting, query the concrete TTI for the cost 1157 // of casting the original vector twice. We also need to factor in the 1158 // cost of the split itself. Count that as 1, to be consistent with 1159 // getTypeLegalizationCost(). 1160 bool SplitSrc = 1161 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 1162 TargetLowering::TypeSplitVector; 1163 bool SplitDst = 1164 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 1165 TargetLowering::TypeSplitVector; 1166 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 1167 DstVTy->getElementCount().isVector()) { 1168 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 1169 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 1170 T *TTI = static_cast<T *>(this); 1171 // If both types need to be split then the split is free. 1172 InstructionCost SplitCost = 1173 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 1174 return SplitCost + 1175 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 1176 CostKind, I)); 1177 } 1178 1179 // Scalarization cost is Invalid, can't assume any num elements. 1180 if (isa<ScalableVectorType>(DstVTy)) 1181 return InstructionCost::getInvalid(); 1182 1183 // In other cases where the source or destination are illegal, assume 1184 // the operation will get scalarized. 1185 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 1186 InstructionCost Cost = thisT()->getCastInstrCost( 1187 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 1188 1189 // Return the cost of multiple scalar invocation plus the cost of 1190 // inserting and extracting the values. 1191 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true, 1192 CostKind) + 1193 Num * Cost; 1194 } 1195 1196 // We already handled vector-to-vector and scalar-to-scalar conversions. 1197 // This 1198 // is where we handle bitcast between vectors and scalars. We need to assume 1199 // that the conversion is scalarized in one way or another. 1200 if (Opcode == Instruction::BitCast) { 1201 // Illegal bitcasts are done by storing and loading from a stack slot. 1202 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false, 1203 /*Extract*/ true, CostKind) 1204 : 0) + 1205 (DstVTy ? 
getScalarizationOverhead(DstVTy, /*Insert*/ true, 1206 /*Extract*/ false, CostKind) 1207 : 0); 1208 } 1209 1210 llvm_unreachable("Unhandled cast"); 1211 } 1212 getExtractWithExtendCost(unsigned Opcode,Type * Dst,VectorType * VecTy,unsigned Index)1213 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1214 VectorType *VecTy, unsigned Index) { 1215 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1216 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1217 CostKind, Index, nullptr, nullptr) + 1218 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1219 TTI::CastContextHint::None, CostKind); 1220 } 1221 1222 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1223 const Instruction *I = nullptr) { 1224 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1225 } 1226 1227 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1228 CmpInst::Predicate VecPred, 1229 TTI::TargetCostKind CostKind, 1230 const Instruction *I = nullptr) { 1231 const TargetLoweringBase *TLI = getTLI(); 1232 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1233 assert(ISD && "Invalid opcode"); 1234 1235 // TODO: Handle other cost kinds. 1236 if (CostKind != TTI::TCK_RecipThroughput) 1237 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1238 I); 1239 1240 // Selects on vectors are actually vector selects. 1241 if (ISD == ISD::SELECT) { 1242 assert(CondTy && "CondTy must exist"); 1243 if (CondTy->isVectorTy()) 1244 ISD = ISD::VSELECT; 1245 } 1246 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1247 1248 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1249 !TLI->isOperationExpand(ISD, LT.second)) { 1250 // The operation is legal. Assume it costs 1. Multiply 1251 // by the type-legalization overhead. 1252 return LT.first * 1; 1253 } 1254 1255 // Otherwise, assume that the cast is scalarized. 1256 // TODO: If one of the types get legalized by splitting, handle this 1257 // similarly to what getCastInstrCost() does. 1258 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1259 if (isa<ScalableVectorType>(ValTy)) 1260 return InstructionCost::getInvalid(); 1261 1262 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1263 if (CondTy) 1264 CondTy = CondTy->getScalarType(); 1265 InstructionCost Cost = thisT()->getCmpSelInstrCost( 1266 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); 1267 1268 // Return the cost of multiple scalar invocation plus the cost of 1269 // inserting and extracting the values. 1270 return getScalarizationOverhead(ValVTy, /*Insert*/ true, 1271 /*Extract*/ false, CostKind) + 1272 Num * Cost; 1273 } 1274 1275 // Unknown scalar opcode. 
1276 return 1; 1277 } 1278 getVectorInstrCost(unsigned Opcode,Type * Val,TTI::TargetCostKind CostKind,unsigned Index,Value * Op0,Value * Op1)1279 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1280 TTI::TargetCostKind CostKind, 1281 unsigned Index, Value *Op0, Value *Op1) { 1282 return getRegUsageForType(Val->getScalarType()); 1283 } 1284 getVectorInstrCost(const Instruction & I,Type * Val,TTI::TargetCostKind CostKind,unsigned Index)1285 InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, 1286 TTI::TargetCostKind CostKind, 1287 unsigned Index) { 1288 Value *Op0 = nullptr; 1289 Value *Op1 = nullptr; 1290 if (auto *IE = dyn_cast<InsertElementInst>(&I)) { 1291 Op0 = IE->getOperand(0); 1292 Op1 = IE->getOperand(1); 1293 } 1294 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, 1295 Op1); 1296 } 1297 getReplicationShuffleCost(Type * EltTy,int ReplicationFactor,int VF,const APInt & DemandedDstElts,TTI::TargetCostKind CostKind)1298 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 1299 int VF, 1300 const APInt &DemandedDstElts, 1301 TTI::TargetCostKind CostKind) { 1302 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && 1303 "Unexpected size of DemandedDstElts."); 1304 1305 InstructionCost Cost; 1306 1307 auto *SrcVT = FixedVectorType::get(EltTy, VF); 1308 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); 1309 1310 // The Mask shuffling cost is extract all the elements of the Mask 1311 // and insert each of them Factor times into the wide vector: 1312 // 1313 // E.g. an interleaved group with factor 3: 1314 // %mask = icmp ult <8 x i32> %vec1, %vec2 1315 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1316 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1317 // The cost is estimated as extract all mask elements from the <8xi1> mask 1318 // vector and insert them factor times into the <24xi1> shuffled mask 1319 // vector. 1320 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); 1321 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, 1322 /*Insert*/ false, 1323 /*Extract*/ true, CostKind); 1324 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, 1325 /*Insert*/ true, 1326 /*Extract*/ false, CostKind); 1327 1328 return Cost; 1329 } 1330 1331 InstructionCost 1332 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, 1333 unsigned AddressSpace, TTI::TargetCostKind CostKind, 1334 TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, 1335 const Instruction *I = nullptr) { 1336 assert(!Src->isVoidTy() && "Invalid type"); 1337 // Assume types, such as structs, are expensive. 1338 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1339 return 4; 1340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 1341 1342 // Assuming that all loads of legal types cost 1. 1343 InstructionCost Cost = LT.first; 1344 if (CostKind != TTI::TCK_RecipThroughput) 1345 return Cost; 1346 1347 const DataLayout &DL = this->getDataLayout(); 1348 if (Src->isVectorTy() && 1349 // In practice it's not currently possible to have a change in lane 1350 // length for extending loads or truncating stores so both types should 1351 // have the same scalable property. 1352 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), 1353 LT.second.getSizeInBits())) { 1354 // This is a vector load that legalizes to a larger type than the vector 1355 // itself. 
Unless the corresponding extending load or truncating store is 1356 // legal, then this will scalarize. 1357 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1358 EVT MemVT = getTLI()->getValueType(DL, Src); 1359 if (Opcode == Instruction::Store) 1360 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1361 else 1362 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1363 1364 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1365 // This is a vector load/store for some illegal type that is scalarized. 1366 // We must account for the cost of building or decomposing the vector. 1367 Cost += getScalarizationOverhead( 1368 cast<VectorType>(Src), Opcode != Instruction::Store, 1369 Opcode == Instruction::Store, CostKind); 1370 } 1371 } 1372 1373 return Cost; 1374 } 1375 getMaskedMemoryOpCost(unsigned Opcode,Type * DataTy,Align Alignment,unsigned AddressSpace,TTI::TargetCostKind CostKind)1376 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1377 Align Alignment, unsigned AddressSpace, 1378 TTI::TargetCostKind CostKind) { 1379 // TODO: Pass on AddressSpace when we have test coverage. 1380 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1381 CostKind); 1382 } 1383 1384 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1385 const Value *Ptr, bool VariableMask, 1386 Align Alignment, 1387 TTI::TargetCostKind CostKind, 1388 const Instruction *I = nullptr) { 1389 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1390 true, CostKind); 1391 } 1392 getStridedMemoryOpCost(unsigned Opcode,Type * DataTy,const Value * Ptr,bool VariableMask,Align Alignment,TTI::TargetCostKind CostKind,const Instruction * I)1393 InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, 1394 const Value *Ptr, bool VariableMask, 1395 Align Alignment, 1396 TTI::TargetCostKind CostKind, 1397 const Instruction *I) { 1398 // For a target without strided memory operations (or for an illegal 1399 // operation type on one which does), assume we lower to a gather/scatter 1400 // operation. (Which may in turn be scalarized.) 1401 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1402 Alignment, CostKind, I); 1403 } 1404 1405 InstructionCost getInterleavedMemoryOpCost( 1406 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1407 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1408 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1409 1410 // We cannot scalarize scalable vectors, so return Invalid. 1411 if (isa<ScalableVectorType>(VecTy)) 1412 return InstructionCost::getInvalid(); 1413 1414 auto *VT = cast<FixedVectorType>(VecTy); 1415 1416 unsigned NumElts = VT->getNumElements(); 1417 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1418 1419 unsigned NumSubElts = NumElts / Factor; 1420 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1421 1422 // Firstly, the cost of load/store operation. 1423 InstructionCost Cost; 1424 if (UseMaskForCond || UseMaskForGaps) 1425 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1426 AddressSpace, CostKind); 1427 else 1428 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1429 CostKind); 1430 1431 // Legalize the vector type, and get the legalized and unlegalized type 1432 // sizes. 
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., an interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // TODO: Note that legalization can turn masked loads/stores into unmasked
    // (legalized) loads/stores. This can be reflected in the cost.
    if (Cost.isValid() && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
    }

    // Next, add the cost of the interleave operation.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");

    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);

    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub vectors' elements
      // from the wide vector and inserting them into the sub vectors.
      //
      // E.g. an interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>   ; Index 0
      // The cost is estimated as extracting elements 0, 2, 4 and 6 from the
      // <8 x i32> vector and inserting them into a <4 x i32> vector.
      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ true, /*Extract*/ false, CostKind);
      Cost += Indices.size() * InsSubCost;
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ false,
                                                /*Extract*/ true, CostKind);
    } else {
      // The interleave cost is extracting the elements from the sub vectors
      // and inserting them into the wide vector.
      //
      // E.g.
An interleaved store of factor 3 with 2 members at indices 0,1: 1506 // (using VF=4): 1507 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> 1508 // %gaps.mask = <true, true, false, true, true, false, 1509 // true, true, false, true, true, false> 1510 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, 1511 // i32 Align, <12 x i1> %gaps.mask 1512 // The cost is estimated as extract all elements (of actual members, 1513 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x 1514 // i32> vector. 1515 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( 1516 SubVT, DemandedAllSubElts, 1517 /*Insert*/ false, /*Extract*/ true, CostKind); 1518 Cost += ExtSubCost * Indices.size(); 1519 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1520 /*Insert*/ true, 1521 /*Extract*/ false, CostKind); 1522 } 1523 1524 if (!UseMaskForCond) 1525 return Cost; 1526 1527 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1528 1529 Cost += thisT()->getReplicationShuffleCost( 1530 I8Type, Factor, NumSubElts, 1531 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, 1532 CostKind); 1533 1534 // The Gaps mask is invariant and created outside the loop, therefore the 1535 // cost of creating it is not accounted for here. However if we have both 1536 // a MaskForGaps and some other mask that guards the execution of the 1537 // memory access, we need to account for the cost of And-ing the two masks 1538 // inside the loop. 1539 if (UseMaskForGaps) { 1540 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1541 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1542 CostKind); 1543 } 1544 1545 return Cost; 1546 } 1547 1548 /// Get intrinsic cost based on arguments. getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1549 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1550 TTI::TargetCostKind CostKind) { 1551 // Check for generically free intrinsics. 1552 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1553 return 0; 1554 1555 // Assume that target intrinsics are cheap. 1556 Intrinsic::ID IID = ICA.getID(); 1557 if (Function::isTargetIntrinsic(IID)) 1558 return TargetTransformInfo::TCC_Basic; 1559 1560 if (ICA.isTypeBasedOnly()) 1561 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1562 1563 Type *RetTy = ICA.getReturnType(); 1564 1565 ElementCount RetVF = 1566 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1567 : ElementCount::getFixed(1)); 1568 const IntrinsicInst *I = ICA.getInst(); 1569 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1570 FastMathFlags FMF = ICA.getFlags(); 1571 switch (IID) { 1572 default: 1573 break; 1574 1575 case Intrinsic::powi: 1576 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) { 1577 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); 1578 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), 1579 ShouldOptForSize)) { 1580 // The cost is modeled on the expansion performed by ExpandPowI in 1581 // SelectionDAGBuilder. 
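          // As a worked example (exponent chosen purely for illustration):
          // for powi(x, 11) the exponent is 0b1011, so ActiveBits = 4 and
          // PopCount = 3. Repeated squaring builds x^2, x^4 and x^8 with 3
          // multiplies and combines x^8 * x^2 * x with 2 more, matching the
          // ActiveBits + PopCount - 2 = 5 fmuls costed below.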
1582 APInt Exponent = RHSC->getValue().abs(); 1583 unsigned ActiveBits = Exponent.getActiveBits(); 1584 unsigned PopCount = Exponent.popcount(); 1585 InstructionCost Cost = (ActiveBits + PopCount - 2) * 1586 thisT()->getArithmeticInstrCost( 1587 Instruction::FMul, RetTy, CostKind); 1588 if (RHSC->isNegative()) 1589 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, 1590 CostKind); 1591 return Cost; 1592 } 1593 } 1594 break; 1595 case Intrinsic::cttz: 1596 // FIXME: If necessary, this should go in target-specific overrides. 1597 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) 1598 return TargetTransformInfo::TCC_Basic; 1599 break; 1600 1601 case Intrinsic::ctlz: 1602 // FIXME: If necessary, this should go in target-specific overrides. 1603 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) 1604 return TargetTransformInfo::TCC_Basic; 1605 break; 1606 1607 case Intrinsic::memcpy: 1608 return thisT()->getMemcpyCost(ICA.getInst()); 1609 1610 case Intrinsic::masked_scatter: { 1611 const Value *Mask = Args[3]; 1612 bool VarMask = !isa<Constant>(Mask); 1613 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1614 return thisT()->getGatherScatterOpCost(Instruction::Store, 1615 ICA.getArgTypes()[0], Args[1], 1616 VarMask, Alignment, CostKind, I); 1617 } 1618 case Intrinsic::masked_gather: { 1619 const Value *Mask = Args[2]; 1620 bool VarMask = !isa<Constant>(Mask); 1621 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1622 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1623 VarMask, Alignment, CostKind, I); 1624 } 1625 case Intrinsic::experimental_vp_strided_store: { 1626 const Value *Data = Args[0]; 1627 const Value *Ptr = Args[1]; 1628 const Value *Mask = Args[3]; 1629 const Value *EVL = Args[4]; 1630 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1631 Align Alignment = I->getParamAlign(1).valueOrOne(); 1632 return thisT()->getStridedMemoryOpCost(Instruction::Store, 1633 Data->getType(), Ptr, VarMask, 1634 Alignment, CostKind, I); 1635 } 1636 case Intrinsic::experimental_vp_strided_load: { 1637 const Value *Ptr = Args[0]; 1638 const Value *Mask = Args[2]; 1639 const Value *EVL = Args[3]; 1640 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1641 Align Alignment = I->getParamAlign(0).valueOrOne(); 1642 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr, 1643 VarMask, Alignment, CostKind, I); 1644 } 1645 case Intrinsic::experimental_stepvector: { 1646 if (isa<ScalableVectorType>(RetTy)) 1647 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1648 // The cost of materialising a constant integer vector. 
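      // For a fixed-length result this is simply a constant vector; e.g. a
      // <4 x i32> stepvector materialises as <i32 0, i32 1, i32 2, i32 3>.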
1649 return TargetTransformInfo::TCC_Basic; 1650 } 1651 case Intrinsic::vector_extract: { 1652 // FIXME: Handle case where a scalable vector is extracted from a scalable 1653 // vector 1654 if (isa<ScalableVectorType>(RetTy)) 1655 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1656 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1657 return thisT()->getShuffleCost( 1658 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()), 1659 std::nullopt, CostKind, Index, cast<VectorType>(RetTy)); 1660 } 1661 case Intrinsic::vector_insert: { 1662 // FIXME: Handle case where a scalable vector is inserted into a scalable 1663 // vector 1664 if (isa<ScalableVectorType>(Args[1]->getType())) 1665 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1666 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1667 return thisT()->getShuffleCost( 1668 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), 1669 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType())); 1670 } 1671 case Intrinsic::vector_reverse: { 1672 return thisT()->getShuffleCost( 1673 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt, 1674 CostKind, 0, cast<VectorType>(RetTy)); 1675 } 1676 case Intrinsic::vector_splice: { 1677 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1678 return thisT()->getShuffleCost( 1679 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt, 1680 CostKind, Index, cast<VectorType>(RetTy)); 1681 } 1682 case Intrinsic::vector_reduce_add: 1683 case Intrinsic::vector_reduce_mul: 1684 case Intrinsic::vector_reduce_and: 1685 case Intrinsic::vector_reduce_or: 1686 case Intrinsic::vector_reduce_xor: 1687 case Intrinsic::vector_reduce_smax: 1688 case Intrinsic::vector_reduce_smin: 1689 case Intrinsic::vector_reduce_fmax: 1690 case Intrinsic::vector_reduce_fmin: 1691 case Intrinsic::vector_reduce_fmaximum: 1692 case Intrinsic::vector_reduce_fminimum: 1693 case Intrinsic::vector_reduce_umax: 1694 case Intrinsic::vector_reduce_umin: { 1695 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1696 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1697 } 1698 case Intrinsic::vector_reduce_fadd: 1699 case Intrinsic::vector_reduce_fmul: { 1700 IntrinsicCostAttributes Attrs( 1701 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1702 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1703 } 1704 case Intrinsic::fshl: 1705 case Intrinsic::fshr: { 1706 const Value *X = Args[0]; 1707 const Value *Y = Args[1]; 1708 const Value *Z = Args[2]; 1709 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X); 1710 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y); 1711 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z); 1712 const TTI::OperandValueInfo OpInfoBW = 1713 {TTI::OK_UniformConstantValue, 1714 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? 
              TTI::OP_PowerOf2
              : TTI::OP_None};

      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
      InstructionCost Cost = 0;
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
          {OpInfoZ.Kind, TTI::OP_None});
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
          {OpInfoZ.Kind, TTI::OP_None});
      // Non-constant shift amounts require a modulo.
      if (!OpInfoZ.isConstant())
        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
                                                CostKind, OpInfoZ, OpInfoBW);
      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
      if (X != Y) {
        Type *CondTy = RetTy->getWithNewBitWidth(1);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                        CmpInst::ICMP_EQ, CostKind);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                        CmpInst::ICMP_EQ, CostKind);
      }
      return Cost;
    }
    case Intrinsic::get_active_lane_mask: {
      EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);

      // If we're not expanding the intrinsic then we assume this is cheap
      // to implement.
      if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
        return getTypeLegalizationCost(RetTy).first;
      }

      // Create the expanded types that will be used to calculate the uadd_sat
      // operation.
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
      InstructionCost Cost =
          thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
      Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
                                          CmpInst::ICMP_ULT, CostKind);
      return Cost;
    }
    case Intrinsic::experimental_cttz_elts: {
      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);

      // If we're not expanding the intrinsic then we assume this is cheap
      // to implement.
      if (!getTLI()->shouldExpandCttzElements(ArgType))
        return getTypeLegalizationCost(RetTy).first;

      // TODO: The costs below reflect the expansion code in
      // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
      // favour of compile time.

      // Find the smallest "sensible" element type to use for the expansion.
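      // The element type only needs enough bits to represent the largest
      // possible element count of the source vector; for scalable types that
      // bound depends on the vscale range queried below.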
1780 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero(); 1781 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64)); 1782 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller()) 1783 VScaleRange = getVScaleRange(I->getCaller(), 64); 1784 1785 unsigned EltWidth = getTLI()->getBitWidthForCttzElements( 1786 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange); 1787 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth); 1788 1789 // Create the new vector type & get the vector length 1790 Type *NewVecTy = VectorType::get( 1791 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount()); 1792 1793 IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector, 1794 NewVecTy, {}, FMF); 1795 InstructionCost Cost = 1796 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind); 1797 1798 Cost += 1799 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind); 1800 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy, 1801 Args[0]->getType(), 1802 TTI::CastContextHint::None, CostKind); 1803 Cost += 1804 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind); 1805 1806 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax, 1807 NewEltTy, NewVecTy, FMF, I, 1); 1808 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind); 1809 Cost += 1810 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind); 1811 1812 return Cost; 1813 } 1814 } 1815 1816 // VP Intrinsics should have the same cost as their non-vp counterpart. 1817 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp 1818 // counterpart when the vector length argument is smaller than the maximum 1819 // vector length. 1820 // TODO: Support other kinds of VPIntrinsics 1821 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) { 1822 std::optional<unsigned> FOp = 1823 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); 1824 if (FOp) { 1825 if (ICA.getID() == Intrinsic::vp_load) { 1826 Align Alignment; 1827 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1828 Alignment = VPI->getPointerAlignment().valueOrOne(); 1829 unsigned AS = 0; 1830 if (ICA.getArgs().size() > 1) 1831 if (auto *PtrTy = 1832 dyn_cast<PointerType>(ICA.getArgs()[0]->getType())) 1833 AS = PtrTy->getAddressSpace(); 1834 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment, 1835 AS, CostKind); 1836 } 1837 if (ICA.getID() == Intrinsic::vp_store) { 1838 Align Alignment; 1839 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1840 Alignment = VPI->getPointerAlignment().valueOrOne(); 1841 unsigned AS = 0; 1842 if (ICA.getArgs().size() >= 2) 1843 if (auto *PtrTy = 1844 dyn_cast<PointerType>(ICA.getArgs()[1]->getType())) 1845 AS = PtrTy->getAddressSpace(); 1846 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment, 1847 AS, CostKind); 1848 } 1849 if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) { 1850 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(), 1851 CostKind); 1852 } 1853 } 1854 1855 std::optional<Intrinsic::ID> FID = 1856 VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID()); 1857 if (FID) { 1858 // Non-vp version will have same Args/Tys except mask and vector length. 
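        // For instance, a call such as
        //   llvm.vp.reduce.fadd(%start, %vec, %mask, %evl)
        // would be costed as llvm.vector.reduce.fadd(%start, %vec): the
        // trailing mask and vector-length operands are dropped, and for
        // reductions other than fadd/fmul the start value is dropped too.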
        assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
               "Expected VPIntrinsic to have Mask and Vector Length args and "
               "types");
        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);

        // VPReduction intrinsics have a start value argument that their non-vp
        // counterparts do not have, except for the fadd and fmul non-vp
        // counterpart.
        if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
            *FID != Intrinsic::vector_reduce_fadd &&
            *FID != Intrinsic::vector_reduce_fmul)
          NewTys = NewTys.drop_front();

        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
                                       ICA.getFlags());
        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
      }
    }

    // Assume that we need to scalarize this intrinsic.
    // Compute the scalarization overhead based on Args for a vector
    // intrinsic.
    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
    if (RetVF.isVector() && !RetVF.isScalable()) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost += getScalarizationOverhead(
            cast<VectorType>(RetTy),
            /*Insert*/ true, /*Extract*/ false, CostKind);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
    }

    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
                                  ScalarizationCost);
    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  InstructionCost
  getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                 TTI::TargetCostKind CostKind) {
    Intrinsic::ID IID = ICA.getID();
    Type *RetTy = ICA.getReturnType();
    const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
    FastMathFlags FMF = ICA.getFlags();
    InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
    bool SkipScalarizationCost = ICA.skipScalarizationCost();

    VectorType *VecOpTy = nullptr;
    if (!Tys.empty()) {
      // The vector reduction operand is operand 0 except for fadd/fmul.
      // Their operand 0 is a scalar start value, so the vector op is operand 1.
      unsigned VecTyIndex = 0;
      if (IID == Intrinsic::vector_reduce_fadd ||
          IID == Intrinsic::vector_reduce_fmul)
        VecTyIndex = 1;
      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
    }

    // Library call cost - other than size, make it expensive.
    unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
    unsigned ISD = 0;
    switch (IID) {
    default: {
      // Scalable vectors cannot be scalarized, so return Invalid.
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
          }))
        return InstructionCost::getInvalid();

      // Assume that we need to scalarize this intrinsic.
      InstructionCost ScalarizationCost =
          SkipScalarizationCost ?
ScalarizationCostPassed : 0; 1937 unsigned ScalarCalls = 1; 1938 Type *ScalarRetTy = RetTy; 1939 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1940 if (!SkipScalarizationCost) 1941 ScalarizationCost = getScalarizationOverhead( 1942 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); 1943 ScalarCalls = std::max(ScalarCalls, 1944 cast<FixedVectorType>(RetVTy)->getNumElements()); 1945 ScalarRetTy = RetTy->getScalarType(); 1946 } 1947 SmallVector<Type *, 4> ScalarTys; 1948 for (Type *Ty : Tys) { 1949 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1950 if (!SkipScalarizationCost) 1951 ScalarizationCost += getScalarizationOverhead( 1952 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 1953 ScalarCalls = std::max(ScalarCalls, 1954 cast<FixedVectorType>(VTy)->getNumElements()); 1955 Ty = Ty->getScalarType(); 1956 } 1957 ScalarTys.push_back(Ty); 1958 } 1959 if (ScalarCalls == 1) 1960 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1961 1962 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1963 InstructionCost ScalarCost = 1964 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1965 1966 return ScalarCalls * ScalarCost + ScalarizationCost; 1967 } 1968 // Look for intrinsics that can be lowered directly or turned into a scalar 1969 // intrinsic call. 1970 case Intrinsic::sqrt: 1971 ISD = ISD::FSQRT; 1972 break; 1973 case Intrinsic::sin: 1974 ISD = ISD::FSIN; 1975 break; 1976 case Intrinsic::cos: 1977 ISD = ISD::FCOS; 1978 break; 1979 case Intrinsic::tan: 1980 ISD = ISD::FTAN; 1981 break; 1982 case Intrinsic::asin: 1983 ISD = ISD::FASIN; 1984 break; 1985 case Intrinsic::acos: 1986 ISD = ISD::FACOS; 1987 break; 1988 case Intrinsic::atan: 1989 ISD = ISD::FATAN; 1990 break; 1991 case Intrinsic::sinh: 1992 ISD = ISD::FSINH; 1993 break; 1994 case Intrinsic::cosh: 1995 ISD = ISD::FCOSH; 1996 break; 1997 case Intrinsic::tanh: 1998 ISD = ISD::FTANH; 1999 break; 2000 case Intrinsic::exp: 2001 ISD = ISD::FEXP; 2002 break; 2003 case Intrinsic::exp2: 2004 ISD = ISD::FEXP2; 2005 break; 2006 case Intrinsic::exp10: 2007 ISD = ISD::FEXP10; 2008 break; 2009 case Intrinsic::log: 2010 ISD = ISD::FLOG; 2011 break; 2012 case Intrinsic::log10: 2013 ISD = ISD::FLOG10; 2014 break; 2015 case Intrinsic::log2: 2016 ISD = ISD::FLOG2; 2017 break; 2018 case Intrinsic::fabs: 2019 ISD = ISD::FABS; 2020 break; 2021 case Intrinsic::canonicalize: 2022 ISD = ISD::FCANONICALIZE; 2023 break; 2024 case Intrinsic::minnum: 2025 ISD = ISD::FMINNUM; 2026 break; 2027 case Intrinsic::maxnum: 2028 ISD = ISD::FMAXNUM; 2029 break; 2030 case Intrinsic::minimum: 2031 ISD = ISD::FMINIMUM; 2032 break; 2033 case Intrinsic::maximum: 2034 ISD = ISD::FMAXIMUM; 2035 break; 2036 case Intrinsic::copysign: 2037 ISD = ISD::FCOPYSIGN; 2038 break; 2039 case Intrinsic::floor: 2040 ISD = ISD::FFLOOR; 2041 break; 2042 case Intrinsic::ceil: 2043 ISD = ISD::FCEIL; 2044 break; 2045 case Intrinsic::trunc: 2046 ISD = ISD::FTRUNC; 2047 break; 2048 case Intrinsic::nearbyint: 2049 ISD = ISD::FNEARBYINT; 2050 break; 2051 case Intrinsic::rint: 2052 ISD = ISD::FRINT; 2053 break; 2054 case Intrinsic::lrint: 2055 ISD = ISD::LRINT; 2056 break; 2057 case Intrinsic::llrint: 2058 ISD = ISD::LLRINT; 2059 break; 2060 case Intrinsic::round: 2061 ISD = ISD::FROUND; 2062 break; 2063 case Intrinsic::roundeven: 2064 ISD = ISD::FROUNDEVEN; 2065 break; 2066 case Intrinsic::pow: 2067 ISD = ISD::FPOW; 2068 break; 2069 case Intrinsic::fma: 2070 ISD = ISD::FMA; 2071 break; 2072 case Intrinsic::fmuladd: 2073 ISD = ISD::FMA; 2074 break; 2075 
case Intrinsic::experimental_constrained_fmuladd: 2076 ISD = ISD::STRICT_FMA; 2077 break; 2078 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 2079 case Intrinsic::lifetime_start: 2080 case Intrinsic::lifetime_end: 2081 case Intrinsic::sideeffect: 2082 case Intrinsic::pseudoprobe: 2083 case Intrinsic::arithmetic_fence: 2084 return 0; 2085 case Intrinsic::masked_store: { 2086 Type *Ty = Tys[0]; 2087 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2088 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 2089 CostKind); 2090 } 2091 case Intrinsic::masked_load: { 2092 Type *Ty = RetTy; 2093 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2094 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 2095 CostKind); 2096 } 2097 case Intrinsic::vector_reduce_add: 2098 case Intrinsic::vector_reduce_mul: 2099 case Intrinsic::vector_reduce_and: 2100 case Intrinsic::vector_reduce_or: 2101 case Intrinsic::vector_reduce_xor: 2102 return thisT()->getArithmeticReductionCost( 2103 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt, 2104 CostKind); 2105 case Intrinsic::vector_reduce_fadd: 2106 case Intrinsic::vector_reduce_fmul: 2107 return thisT()->getArithmeticReductionCost( 2108 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind); 2109 case Intrinsic::vector_reduce_smax: 2110 case Intrinsic::vector_reduce_smin: 2111 case Intrinsic::vector_reduce_umax: 2112 case Intrinsic::vector_reduce_umin: 2113 case Intrinsic::vector_reduce_fmax: 2114 case Intrinsic::vector_reduce_fmin: 2115 case Intrinsic::vector_reduce_fmaximum: 2116 case Intrinsic::vector_reduce_fminimum: 2117 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID), 2118 VecOpTy, ICA.getFlags(), CostKind); 2119 case Intrinsic::abs: { 2120 // abs(X) = select(icmp(X,0),X,sub(0,X)) 2121 Type *CondTy = RetTy->getWithNewBitWidth(1); 2122 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2123 InstructionCost Cost = 0; 2124 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2125 Pred, CostKind); 2126 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2127 Pred, CostKind); 2128 // TODO: Should we add an OperandValueProperties::OP_Zero property? 2129 Cost += thisT()->getArithmeticInstrCost( 2130 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); 2131 return Cost; 2132 } 2133 case Intrinsic::smax: 2134 case Intrinsic::smin: 2135 case Intrinsic::umax: 2136 case Intrinsic::umin: { 2137 // minmax(X,Y) = select(icmp(X,Y),X,Y) 2138 Type *CondTy = RetTy->getWithNewBitWidth(1); 2139 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 2140 CmpInst::Predicate Pred = 2141 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 2142 InstructionCost Cost = 0; 2143 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2144 Pred, CostKind); 2145 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2146 Pred, CostKind); 2147 return Cost; 2148 } 2149 case Intrinsic::sadd_sat: 2150 case Intrinsic::ssub_sat: { 2151 Type *CondTy = RetTy->getWithNewBitWidth(1); 2152 2153 Type *OpTy = StructType::create({RetTy, CondTy}); 2154 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 2155 ? 
Intrinsic::sadd_with_overflow 2156 : Intrinsic::ssub_with_overflow; 2157 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2158 2159 // SatMax -> Overflow && SumDiff < 0 2160 // SatMin -> Overflow && SumDiff >= 0 2161 InstructionCost Cost = 0; 2162 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2163 nullptr, ScalarizationCostPassed); 2164 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2165 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2166 Pred, CostKind); 2167 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 2168 CondTy, Pred, CostKind); 2169 return Cost; 2170 } 2171 case Intrinsic::uadd_sat: 2172 case Intrinsic::usub_sat: { 2173 Type *CondTy = RetTy->getWithNewBitWidth(1); 2174 2175 Type *OpTy = StructType::create({RetTy, CondTy}); 2176 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 2177 ? Intrinsic::uadd_with_overflow 2178 : Intrinsic::usub_with_overflow; 2179 2180 InstructionCost Cost = 0; 2181 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2182 nullptr, ScalarizationCostPassed); 2183 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2184 Cost += 2185 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2186 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2187 return Cost; 2188 } 2189 case Intrinsic::smul_fix: 2190 case Intrinsic::umul_fix: { 2191 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 2192 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 2193 2194 unsigned ExtOp = 2195 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; 2196 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2197 2198 InstructionCost Cost = 0; 2199 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 2200 Cost += 2201 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2202 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 2203 CCH, CostKind); 2204 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 2205 CostKind, 2206 {TTI::OK_AnyValue, TTI::OP_None}, 2207 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2208 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 2209 {TTI::OK_AnyValue, TTI::OP_None}, 2210 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2211 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 2212 return Cost; 2213 } 2214 case Intrinsic::sadd_with_overflow: 2215 case Intrinsic::ssub_with_overflow: { 2216 Type *SumTy = RetTy->getContainedType(0); 2217 Type *OverflowTy = RetTy->getContainedType(1); 2218 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 2219 ? BinaryOperator::Add 2220 : BinaryOperator::Sub; 2221 2222 // Add: 2223 // Overflow -> (Result < LHS) ^ (RHS < 0) 2224 // Sub: 2225 // Overflow -> (Result < LHS) ^ (RHS > 0) 2226 InstructionCost Cost = 0; 2227 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2228 Cost += 2 * thisT()->getCmpSelInstrCost( 2229 Instruction::ICmp, SumTy, OverflowTy, 2230 CmpInst::ICMP_SGT, CostKind); 2231 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 2232 CostKind); 2233 return Cost; 2234 } 2235 case Intrinsic::uadd_with_overflow: 2236 case Intrinsic::usub_with_overflow: { 2237 Type *SumTy = RetTy->getContainedType(0); 2238 Type *OverflowTy = RetTy->getContainedType(1); 2239 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 2240 ? BinaryOperator::Add 2241 : BinaryOperator::Sub; 2242 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 2243 ? 
CmpInst::ICMP_ULT 2244 : CmpInst::ICMP_UGT; 2245 2246 InstructionCost Cost = 0; 2247 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2248 Cost += 2249 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 2250 Pred, CostKind); 2251 return Cost; 2252 } 2253 case Intrinsic::smul_with_overflow: 2254 case Intrinsic::umul_with_overflow: { 2255 Type *MulTy = RetTy->getContainedType(0); 2256 Type *OverflowTy = RetTy->getContainedType(1); 2257 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 2258 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 2259 bool IsSigned = IID == Intrinsic::smul_with_overflow; 2260 2261 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; 2262 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2263 2264 InstructionCost Cost = 0; 2265 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 2266 Cost += 2267 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2268 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 2269 CCH, CostKind); 2270 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, 2271 CostKind, 2272 {TTI::OK_AnyValue, TTI::OP_None}, 2273 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2274 2275 if (IsSigned) 2276 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 2277 CostKind, 2278 {TTI::OK_AnyValue, TTI::OP_None}, 2279 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2280 2281 Cost += thisT()->getCmpSelInstrCost( 2282 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 2283 return Cost; 2284 } 2285 case Intrinsic::fptosi_sat: 2286 case Intrinsic::fptoui_sat: { 2287 if (Tys.empty()) 2288 break; 2289 Type *FromTy = Tys[0]; 2290 bool IsSigned = IID == Intrinsic::fptosi_sat; 2291 2292 InstructionCost Cost = 0; 2293 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, 2294 {FromTy, FromTy}); 2295 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); 2296 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, 2297 {FromTy, FromTy}); 2298 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); 2299 Cost += thisT()->getCastInstrCost( 2300 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, 2301 TTI::CastContextHint::None, CostKind); 2302 if (IsSigned) { 2303 Type *CondTy = RetTy->getWithNewBitWidth(1); 2304 Cost += thisT()->getCmpSelInstrCost( 2305 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2306 Cost += thisT()->getCmpSelInstrCost( 2307 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2308 } 2309 return Cost; 2310 } 2311 case Intrinsic::ctpop: 2312 ISD = ISD::CTPOP; 2313 // In case of legalization use TCC_Expensive. This is cheaper than a 2314 // library call but still not a cheap instruction. 2315 SingleCallCost = TargetTransformInfo::TCC_Expensive; 2316 break; 2317 case Intrinsic::ctlz: 2318 ISD = ISD::CTLZ; 2319 break; 2320 case Intrinsic::cttz: 2321 ISD = ISD::CTTZ; 2322 break; 2323 case Intrinsic::bswap: 2324 ISD = ISD::BSWAP; 2325 break; 2326 case Intrinsic::bitreverse: 2327 ISD = ISD::BITREVERSE; 2328 break; 2329 } 2330 2331 const TargetLoweringBase *TLI = getTLI(); 2332 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy); 2333 2334 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 2335 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 2336 TLI->isFAbsFree(LT.second)) { 2337 return 0; 2338 } 2339 2340 // The operation is legal. Assume it costs 1. 
2341 // If the type is split to multiple registers, assume that there is some 2342 // overhead to this. 2343 // TODO: Once we have extract/insert subvector cost we need to use them. 2344 if (LT.first > 1) 2345 return (LT.first * 2); 2346 else 2347 return (LT.first * 1); 2348 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 2349 // If the operation is custom lowered then assume 2350 // that the code is twice as expensive. 2351 return (LT.first * 2); 2352 } 2353 2354 // If we can't lower fmuladd into an FMA estimate the cost as a floating 2355 // point mul followed by an add. 2356 if (IID == Intrinsic::fmuladd) 2357 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 2358 CostKind) + 2359 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 2360 CostKind); 2361 if (IID == Intrinsic::experimental_constrained_fmuladd) { 2362 IntrinsicCostAttributes FMulAttrs( 2363 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 2364 IntrinsicCostAttributes FAddAttrs( 2365 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 2366 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 2367 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 2368 } 2369 2370 // Else, assume that we need to scalarize this intrinsic. For math builtins 2371 // this will emit a costly libcall, adding call overhead and spills. Make it 2372 // very expensive. 2373 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2374 // Scalable vectors cannot be scalarized, so return Invalid. 2375 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2376 return isa<ScalableVectorType>(Ty); 2377 })) 2378 return InstructionCost::getInvalid(); 2379 2380 InstructionCost ScalarizationCost = 2381 SkipScalarizationCost 2382 ? ScalarizationCostPassed 2383 : getScalarizationOverhead(RetVTy, /*Insert*/ true, 2384 /*Extract*/ false, CostKind); 2385 2386 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 2387 SmallVector<Type *, 4> ScalarTys; 2388 for (Type *Ty : Tys) { 2389 if (Ty->isVectorTy()) 2390 Ty = Ty->getScalarType(); 2391 ScalarTys.push_back(Ty); 2392 } 2393 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 2394 InstructionCost ScalarCost = 2395 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2396 for (Type *Ty : Tys) { 2397 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 2398 if (!ICA.skipScalarizationCost()) 2399 ScalarizationCost += getScalarizationOverhead( 2400 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 2401 ScalarCalls = std::max(ScalarCalls, 2402 cast<FixedVectorType>(VTy)->getNumElements()); 2403 } 2404 } 2405 return ScalarCalls * ScalarCost + ScalarizationCost; 2406 } 2407 2408 // This is going to be turned into a library call, make it expensive. 2409 return SingleCallCost; 2410 } 2411 2412 /// Compute a cost of the given call instruction. 2413 /// 2414 /// Compute the cost of calling function F with return type RetTy and 2415 /// argument types Tys. F might be nullptr, in this case the cost of an 2416 /// arbitrary call with the specified signature will be returned. 2417 /// This is used, for instance, when we estimate call of a vector 2418 /// counterpart of the given function. 2419 /// \param F Called function, might be nullptr. 2420 /// \param RetTy Return value types. 2421 /// \param Tys Argument types. 2422 /// \returns The cost of Call instruction. 
getCallInstrCost(Function * F,Type * RetTy,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)2423 InstructionCost getCallInstrCost(Function *F, Type *RetTy, 2424 ArrayRef<Type *> Tys, 2425 TTI::TargetCostKind CostKind) { 2426 return 10; 2427 } 2428 getNumberOfParts(Type * Tp)2429 unsigned getNumberOfParts(Type *Tp) { 2430 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 2431 return LT.first.isValid() ? *LT.first.getValue() : 0; 2432 } 2433 getAddressComputationCost(Type * Ty,ScalarEvolution *,const SCEV *)2434 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, 2435 const SCEV *) { 2436 return 0; 2437 } 2438 2439 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. 2440 /// We're assuming that reduction operation are performing the following way: 2441 /// 2442 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 2443 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> 2444 /// \----------------v-------------/ \----------v------------/ 2445 /// n/2 elements n/2 elements 2446 /// %red1 = op <n x t> %val, <n x t> val1 2447 /// After this operation we have a vector %red1 where only the first n/2 2448 /// elements are meaningful, the second n/2 elements are undefined and can be 2449 /// dropped. All other operations are actually working with the vector of 2450 /// length n/2, not n, though the real vector length is still n. 2451 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, 2452 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> 2453 /// \----------------v-------------/ \----------v------------/ 2454 /// n/4 elements 3*n/4 elements 2455 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of 2456 /// length n/2, the resulting vector has length n/4 etc. 2457 /// 2458 /// The cost model should take into account that the actual length of the 2459 /// vector is reduced on each iteration. getTreeReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2460 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, 2461 TTI::TargetCostKind CostKind) { 2462 // Targets must implement a default value for the scalable case, since 2463 // we don't know how many lanes the vector has. 2464 if (isa<ScalableVectorType>(Ty)) 2465 return InstructionCost::getInvalid(); 2466 2467 Type *ScalarTy = Ty->getElementType(); 2468 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2469 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2470 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2471 NumVecElts >= 2) { 2472 // Or reduction for i1 is represented as: 2473 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2474 // %res = cmp ne iReduxWidth %val, 0 2475 // And reduction for i1 is represented as: 2476 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2477 // %res = cmp eq iReduxWidth %val, 11111 2478 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2479 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2480 TTI::CastContextHint::None, CostKind) + 2481 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2482 CmpInst::makeCmpResultType(ValTy), 2483 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2484 } 2485 unsigned NumReduxLevels = Log2_32(NumVecElts); 2486 InstructionCost ArithCost = 0; 2487 InstructionCost ShuffleCost = 0; 2488 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2489 unsigned LongVectorCount = 0; 2490 unsigned MVTLen = 2491 LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1; 2492 while (NumVecElts > MVTLen) { 2493 NumVecElts /= 2; 2494 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2495 ShuffleCost += 2496 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 2497 CostKind, NumVecElts, SubTy); 2498 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2499 Ty = SubTy; 2500 ++LongVectorCount; 2501 } 2502 2503 NumReduxLevels -= LongVectorCount; 2504 2505 // The minimal length of the vector is limited by the real length of vector 2506 // operations performed on the current platform. That's why several final 2507 // reduction operations are performed on the vectors with the same 2508 // architecture-dependent length. 2509 2510 // By default reductions need one shuffle per reduction level. 2511 ShuffleCost += 2512 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2513 std::nullopt, CostKind, 0, Ty); 2514 ArithCost += 2515 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2516 return ShuffleCost + ArithCost + 2517 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2518 CostKind, 0, nullptr, nullptr); 2519 } 2520 2521 /// Try to calculate the cost of performing strict (in-order) reductions, 2522 /// which involves doing a sequence of floating point additions in lane 2523 /// order, starting with an initial value. For example, consider a scalar 2524 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2525 /// 2526 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2527 /// 2528 /// %add1 = %InitVal + %v0 2529 /// %add2 = %add1 + %v1 2530 /// %add3 = %add2 + %v2 2531 /// %add4 = %add3 + %v3 2532 /// 2533 /// As a simple estimate we can say the cost of such a reduction is 4 times 2534 /// the cost of a scalar FP addition. We can only estimate the costs for 2535 /// fixed-width vectors here because for scalable vectors we do not know the 2536 /// runtime number of operations. getOrderedReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2537 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, 2538 TTI::TargetCostKind CostKind) { 2539 // Targets must implement a default value for the scalable case, since 2540 // we don't know how many lanes the vector has. 2541 if (isa<ScalableVectorType>(Ty)) 2542 return InstructionCost::getInvalid(); 2543 2544 auto *VTy = cast<FixedVectorType>(Ty); 2545 InstructionCost ExtractCost = getScalarizationOverhead( 2546 VTy, /*Insert=*/false, /*Extract=*/true, CostKind); 2547 InstructionCost ArithCost = thisT()->getArithmeticInstrCost( 2548 Opcode, VTy->getElementType(), CostKind); 2549 ArithCost *= VTy->getNumElements(); 2550 2551 return ExtractCost + ArithCost; 2552 } 2553 getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,std::optional<FastMathFlags> FMF,TTI::TargetCostKind CostKind)2554 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 2555 std::optional<FastMathFlags> FMF, 2556 TTI::TargetCostKind CostKind) { 2557 assert(Ty && "Unknown reduction vector type"); 2558 if (TTI::requiresOrderedReduction(FMF)) 2559 return getOrderedReductionCost(Opcode, Ty, CostKind); 2560 return getTreeReductionCost(Opcode, Ty, CostKind); 2561 } 2562 2563 /// Try to calculate op costs for min/max reduction operations. 2564 /// \param CondTy Conditional type for the Select instruction. 
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind) {
    // Targets must implement a default value for the scalable case, since
    // we don't know how many lanes the vector has.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost MinMaxCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);

      ShuffleCost +=
          thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                  CostKind, NumVecElts, SubTy);

      IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
      MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 std::nullopt, CostKind, 0, Ty);
    IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
    MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
    // The last min/max should be in vector registers and we counted it above,
    // so we just need a single extractelement.
    return ShuffleCost + MinMaxCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                       CostKind, 0, nullptr, nullptr);
  }

  InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
                                           Type *ResTy, VectorType *Ty,
                                           FastMathFlags FMF,
                                           TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.opcode(ext(Ty A)).
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
    InstructionCost RedCost =
        thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
    InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
        TTI::CastContextHint::None, CostKind);

    return RedCost + ExtCost;
  }

  InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
                                         VectorType *Ty,
                                         TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
    // vecreduce.add(mul(A, B)).
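    // A minimal IR sketch of the modelled pattern (element types chosen
    // purely for illustration):
    //   %a.ext = sext <8 x i8> %a to <8 x i32>
    //   %b.ext = sext <8 x i8> %b to <8 x i32>
    //   %mul   = mul <8 x i32> %a.ext, %b.ext
    //   %red   = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %mul)
    // i.e. two extends, one multiply and one add reduction, which is what
    // the sum computed below accounts for.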
2636 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2637 InstructionCost RedCost = thisT()->getArithmeticReductionCost( 2638 Instruction::Add, ExtTy, std::nullopt, CostKind); 2639 InstructionCost ExtCost = thisT()->getCastInstrCost( 2640 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2641 TTI::CastContextHint::None, CostKind); 2642 2643 InstructionCost MulCost = 2644 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2645 2646 return RedCost + MulCost + 2 * ExtCost; 2647 } 2648 getVectorSplitCost()2649 InstructionCost getVectorSplitCost() { return 1; } 2650 2651 /// @} 2652 }; 2653 2654 /// Concrete BasicTTIImpl that can be used if no further customization 2655 /// is needed. 2656 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2657 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2658 2659 friend class BasicTTIImplBase<BasicTTIImpl>; 2660 2661 const TargetSubtargetInfo *ST; 2662 const TargetLoweringBase *TLI; 2663 getST()2664 const TargetSubtargetInfo *getST() const { return ST; } getTLI()2665 const TargetLoweringBase *getTLI() const { return TLI; } 2666 2667 public: 2668 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2669 }; 2670 2671 } // end namespace llvm 2672 2673 #endif // LLVM_CODEGEN_BASICTTIIMPL_H 2674