//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
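  /// For example, under the model below a broadcast of a <4 x float> is
  /// costed as one extractelement (lane 0) plus four insertelement
  /// operations; targets with a cheaper native splat are expected to
  /// override this.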
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                        CostKind, 0, nullptr, nullptr);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
                                            TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting each element from its
    // source vector plus the cost of inserting them into the result vector.

    // e.g. a <4 x float> shuffle with mask <0,5,2,7> needs to extract from
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
                                              TTI::TargetCostKind CostKind,
                                              int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting each
    // element from the source type plus the cost of inserting them into the
    // result type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                      CostKind, i + Index, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
                                             TTI::TargetCostKind CostKind,
                                             int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting each
    // element from the subvector type plus the cost of inserting them into
    // the result vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
                                      i + Index, nullptr, nullptr);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
      case TTI::MIM_Unindexed:
        return ISD::UNINDEXED;
      case TTI::MIM_PreInc:
        return ISD::PRE_INC;
      case TTI::MIM_PreDec:
        return ISD::PRE_DEC;
      case TTI::MIM_PostInc:
        return ISD::POST_INC;
      case TTI::MIM_PostDec:
        return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

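  /// Shared cost model for masked loads/stores and gathers/scatters that the
  /// target does not support natively. The estimate below sums the extraction
  /// of the per-lane addresses (for gather/scatter), VF scalar memory
  /// operations, the packing (for loads) or unpacking (for stores) of the
  /// value vector, and, when the mask is variable, per-lane branches and PHIs.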
  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind,
                                              unsigned AddressSpace = 0) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    unsigned VF = VT->getNumElements();

    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getScalarizationOverhead(
                  FixedVectorType::get(
                      PointerType::get(VT->getElementType(), 0), VF),
                  /*Insert=*/false, /*Extract=*/true, CostKind)
            : 0;

    // The cost of the scalar loads/stores.
    InstructionCost MemoryOpCost =
        VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
                                      AddressSpace, CostKind);

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // plus the branches and PHIs needed to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          getScalarizationOverhead(
              FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
              /*Insert=*/false, /*Extract=*/true, CostKind) +
          VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
                thisT()->getCFInstrCost(Instruction::PHI, CostKind));
    }

    return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  bool isSingleThreaded() const {
    return getTLI()->getTargetMachine().Options.ThreadModel ==
           ThreadModel::Single;
  }

  std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalAddScalableImmediate(int64_t Imm) {
    return getTLI()->isLegalAddScalableImmediate(Imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
                             Instruction *I = nullptr,
                             int64_t ScalableOffset = 0) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = ScalableOffset;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
    return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
  }

  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
      EVT LegalizedVT =
          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
    };
    while (VF > 2 && IsSupportedByTarget(VF))
      VF /= 2;
    return VF;
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldFoldTerminatingConditionAfterLSR() const {
    return TargetTransformInfoImplBase::
        shouldFoldTerminatingConditionAfterLSR();
  }

  bool shouldDropLSRSolutionIfLessProfitable() const {
    return TargetTransformInfoImplBase::shouldDropLSRSolutionIfLessProfitable();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset.getFixed();
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = BaseOffset.getScalable();
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model meant to be
    /// usable in other places as well (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// on 64-bit architectures, where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  bool preferToKeepConstantsAttached(const Instruction &Inst,
                                     const Function &Fn) const {
    switch (Inst.getOpcode()) {
    default:
      break;
    case Instruction::SDiv:
    case Instruction::SRem:
    case Instruction::UDiv:
    case Instruction::URem: {
      if (!isa<ConstantInt>(Inst.getOperand(1)))
        return false;
      EVT VT = getTLI()->getValueType(DL, Inst.getType());
      return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
    }
    };

    return false;
  }

  unsigned getInliningThresholdMultiplier() const { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
    return 0;
  }

  int getInlinerVectorBonusPercent() const { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          if (ORE) {
            ORE->emit([&]() {
              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
                                        L->getHeader())
                     << "advising against unrolling the loop because it "
                        "contains a "
                     << ore::NV("Call", &I);
            });
          }
          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    return BaseT::preferPredicateOverEpilogue(TFI);
  }

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
    return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  virtual std::optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return std::optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual std::optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    std::optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
    return getST()->shouldPrefetchAddressSpace(AS);
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
  std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
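  /// For example, with DemandedElts = 0b0101 on a <4 x i32> and both Insert
  /// and Extract set, the loop below adds an insertelement and an
  /// extractelement cost for lanes 0 and 2 only.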
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract,
                                           TTI::TargetCostKind CostKind) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
                                            CostKind, i, nullptr, nullptr);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, i, nullptr, nullptr);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract,
                                           TTI::TargetCostKind CostKind) {
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                             CostKind);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  InstructionCost
  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                   ArrayRef<Type *> Tys,
                                   TTI::TargetCostKind CostKind) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
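  /// For example, for an operation returning <4 x float> with two distinct
  /// non-constant vector arguments, this adds the cost of inserting the four
  /// result lanes plus the cost of extracting the lanes of each unique
  /// argument.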
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys,
                                           TTI::TargetCostKind CostKind) {
    InstructionCost Cost = getScalarizationOverhead(
        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
                                       /*Extract*/ true, CostKind);

    return Cost;
  }

  /// Estimate the cost of type-legalization and the legalized type.
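  /// The cost below starts at 1 and doubles on every split/expand step, so
  /// e.g. a vector type that has to be split twice before it becomes legal is
  /// reported with a legalization cost of 4.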
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;
    // We keep legalizing the type until we find a legal kind. We assume that
    // the only operation that costs anything is the split. After splitting
    // we need to handle two types.
    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Ensure we return a sensible simple VT here, since many callers of
        // this function require it.
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());

      if (LK.first == TargetLoweringBase::TypeSplitVector ||
          LK.first == TargetLoweringBase::TypeExpandInteger)
        Cost *= 2;

      // Do not loop with f128 type.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Keep legalizing the type.
      MTy = LK.second;
    }
  }

  unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = std::nullopt,
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

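  /// Refine a generic permute shuffle kind into a more specific one (reverse,
  /// broadcast, select, transpose, splice, or an extract/insert-subvector
  /// shuffle) when the mask makes the pattern explicit; Index and SubTy are
  /// updated for the subvector cases.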
  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask,
                                              VectorType *Ty, int &Index,
                                              VectorType *&SubTy) const {
    if (Mask.empty())
      return Kind;
    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc:
      if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
        return TTI::SK_Broadcast;
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
          (Index + Mask.size()) <= (size_t)NumSrcElts) {
        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
        return TTI::SK_ExtractSubvector;
      }
      break;
    case TTI::SK_PermuteTwoSrc: {
      int NumSubElts;
      if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                                 Mask, NumSrcElts, NumSubElts, Index)) {
        if (Index + NumSubElts > NumSrcElts)
          return Kind;
        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
        return TTI::SK_InsertSubvector;
      }
      if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
        return TTI::SK_Transpose;
      if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
        return TTI::SK_Splice;
      break;
    }
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt,
                                 const Instruction *CxtI = nullptr) {
    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
    case TTI::SK_Broadcast:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

1053   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1054                                    TTI::CastContextHint CCH,
1055                                    TTI::TargetCostKind CostKind,
1056                                    const Instruction *I = nullptr) {
1057     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1058       return 0;
1059 
1060     const TargetLoweringBase *TLI = getTLI();
1061     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1062     assert(ISD && "Invalid opcode");
1063     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1064     std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1065 
1066     TypeSize SrcSize = SrcLT.second.getSizeInBits();
1067     TypeSize DstSize = DstLT.second.getSizeInBits();
1068     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1069     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1070 
1071     switch (Opcode) {
1072     default:
1073       break;
1074     case Instruction::Trunc:
1075       // Check for NOOP conversions.
1076       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1077         return 0;
1078       [[fallthrough]];
1079     case Instruction::BitCast:
1080       // Bitcast between types that are legalized to the same type are free and
1081       // assume int to/from ptr of the same size is also free.
1082       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1083           SrcSize == DstSize)
1084         return 0;
1085       break;
1086     case Instruction::FPExt:
1087       if (I && getTLI()->isExtFree(I))
1088         return 0;
1089       break;
1090     case Instruction::ZExt:
1091       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1092         return 0;
1093       [[fallthrough]];
1094     case Instruction::SExt:
1095       if (I && getTLI()->isExtFree(I))
1096         return 0;
1097 
1098       // If this is a zext/sext of a load, return 0 if the corresponding
1099       // extending load exists on target and the result type is legal.
1100       if (CCH == TTI::CastContextHint::Normal) {
1101         EVT ExtVT = EVT::getEVT(Dst);
1102         EVT LoadVT = EVT::getEVT(Src);
1103         unsigned LType =
1104           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1105         if (DstLT.first == SrcLT.first &&
1106             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1107           return 0;
1108       }
1109       break;
1110     case Instruction::AddrSpaceCast:
1111       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1112                                    Dst->getPointerAddressSpace()))
1113         return 0;
1114       break;
1115     }
1116 
1117     auto *SrcVTy = dyn_cast<VectorType>(Src);
1118     auto *DstVTy = dyn_cast<VectorType>(Dst);
1119 
1120     // If the cast is marked as legal (or promote) then assume low cost.
1121     if (SrcLT.first == DstLT.first &&
1122         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1123       return SrcLT.first;
1124 
1125     // Handle scalar conversions.
1126     if (!SrcVTy && !DstVTy) {
1127       // Just check the op cost. If the operation is legal then assume it costs
1128       // 1.
1129       if (!TLI->isOperationExpand(ISD, DstLT.second))
1130         return 1;
1131 
1132       // Assume that illegal scalar instruction are expensive.
1133       return 4;
1134     }
1135 
1136     // Check vector-to-vector casts.
1137     if (DstVTy && SrcVTy) {
1138       // If the cast is between same-sized registers, then the check is simple.
1139       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1140 
1141         // Assume that Zext is done using AND.
1142         if (Opcode == Instruction::ZExt)
1143           return SrcLT.first;
1144 
1145         // Assume that sext is done using SHL and SRA.
1146         if (Opcode == Instruction::SExt)
1147           return SrcLT.first * 2;
1148 
1149         // Just check the op cost. If the operation is legal then assume it
1150         // costs
1151         // 1 and multiply by the type-legalization overhead.
1152         if (!TLI->isOperationExpand(ISD, DstLT.second))
1153           return SrcLT.first * 1;
1154       }
1155 
1156       // If we are legalizing by splitting, query the concrete TTI for the cost
1157       // of casting the original vector twice. We also need to factor in the
1158       // cost of the split itself. Count that as 1, to be consistent with
1159       // getTypeLegalizationCost().
1160       bool SplitSrc =
1161           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1162           TargetLowering::TypeSplitVector;
1163       bool SplitDst =
1164           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1165           TargetLowering::TypeSplitVector;
1166       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1167           DstVTy->getElementCount().isVector()) {
1168         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1169         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1170         T *TTI = static_cast<T *>(this);
1171         // If both types need to be split then the split is free.
1172         InstructionCost SplitCost =
1173             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1174         return SplitCost +
1175                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1176                                           CostKind, I));
1177       }
1178 
1179       // Scalarization cost is Invalid, can't assume any num elements.
1180       if (isa<ScalableVectorType>(DstVTy))
1181         return InstructionCost::getInvalid();
1182 
1183       // In other cases where the source or destination are illegal, assume
1184       // the operation will get scalarized.
1185       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1186       InstructionCost Cost = thisT()->getCastInstrCost(
1187           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1188 
1189       // Return the cost of multiple scalar invocation plus the cost of
1190       // inserting and extracting the values.
1191       return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1192                                       CostKind) +
1193              Num * Cost;
1194     }
1195 
1196     // We already handled vector-to-vector and scalar-to-scalar conversions.
1197     // This
1198     // is where we handle bitcast between vectors and scalars. We need to assume
1199     //  that the conversion is scalarized in one way or another.
1200     if (Opcode == Instruction::BitCast) {
1201       // Illegal bitcasts are done by storing and loading from a stack slot.
1202       return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1203                                                 /*Extract*/ true, CostKind)
1204                      : 0) +
1205              (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1206                                                 /*Extract*/ false, CostKind)
1207                      : 0);
1208     }
1209 
1210     llvm_unreachable("Unhandled cast");
1211   }
1212 
getExtractWithExtendCost(unsigned Opcode,Type * Dst,VectorType * VecTy,unsigned Index)1213   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1214                                            VectorType *VecTy, unsigned Index) {
1215     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1216     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1217                                        CostKind, Index, nullptr, nullptr) +
1218            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1219                                      TTI::CastContextHint::None, CostKind);
1220   }
1221 
1222   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1223                                  const Instruction *I = nullptr) {
1224     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1225   }
1226 
1227   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1228                                      CmpInst::Predicate VecPred,
1229                                      TTI::TargetCostKind CostKind,
1230                                      const Instruction *I = nullptr) {
1231     const TargetLoweringBase *TLI = getTLI();
1232     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1233     assert(ISD && "Invalid opcode");
1234 
1235     // TODO: Handle other cost kinds.
1236     if (CostKind != TTI::TCK_RecipThroughput)
1237       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1238                                        I);
1239 
1240     // Selects on vectors are actually vector selects.
1241     if (ISD == ISD::SELECT) {
1242       assert(CondTy && "CondTy must exist");
1243       if (CondTy->isVectorTy())
1244         ISD = ISD::VSELECT;
1245     }
1246     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1247 
1248     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1249         !TLI->isOperationExpand(ISD, LT.second)) {
1250       // The operation is legal. Assume it costs 1. Multiply
1251       // by the type-legalization overhead.
1252       return LT.first * 1;
1253     }
1254 
1255     // Otherwise, assume that the compare/select is scalarized.
1256     // TODO: If one of the types get legalized by splitting, handle this
1257     // similarly to what getCastInstrCost() does.
1258     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1259       if (isa<ScalableVectorType>(ValTy))
1260         return InstructionCost::getInvalid();
1261 
1262       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1263       if (CondTy)
1264         CondTy = CondTy->getScalarType();
1265       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1266           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1267 
1268       // Return the cost of multiple scalar invocation plus the cost of
1269       // inserting and extracting the values.
1270       return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1271                                       /*Extract*/ false, CostKind) +
1272              Num * Cost;
1273     }
1274 
1275     // Unknown scalar opcode.
1276     return 1;
1277   }
1278 
1279   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1280                                      TTI::TargetCostKind CostKind,
1281                                      unsigned Index, Value *Op0, Value *Op1) {
1282     return getRegUsageForType(Val->getScalarType());
1283   }
1284 
1285   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1286                                      TTI::TargetCostKind CostKind,
1287                                      unsigned Index) {
1288     Value *Op0 = nullptr;
1289     Value *Op1 = nullptr;
1290     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1291       Op0 = IE->getOperand(0);
1292       Op1 = IE->getOperand(1);
1293     }
1294     return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1295                                        Op1);
1296   }
1297 
1298   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1299                                             int VF,
1300                                             const APInt &DemandedDstElts,
1301                                             TTI::TargetCostKind CostKind) {
1302     assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1303            "Unexpected size of DemandedDstElts.");
1304 
1305     InstructionCost Cost;
1306 
1307     auto *SrcVT = FixedVectorType::get(EltTy, VF);
1308     auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1309 
1310     // The Mask shuffling cost is modeled as extracting all the elements of
1311     // the Mask and inserting each of them Factor times into the wide vector:
1312     //
1313     // E.g. an interleaved group with factor 3:
1314     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1315     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1316     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1317     // The cost is estimated as extracting all mask elements from the <8xi1>
1318     // mask vector and inserting them factor times into the <24xi1> shuffled
1319     // mask vector.
1320     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1321     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1322                                               /*Insert*/ false,
1323                                               /*Extract*/ true, CostKind);
1324     Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1325                                               /*Insert*/ true,
1326                                               /*Extract*/ false, CostKind);
1327 
1328     return Cost;
1329   }
1330 
1331   InstructionCost
1332   getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1333                   unsigned AddressSpace, TTI::TargetCostKind CostKind,
1334                   TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
1335                   const Instruction *I = nullptr) {
1336     assert(!Src->isVoidTy() && "Invalid type");
1337     // Assume types, such as structs, are expensive.
1338     if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1339       return 4;
1340     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1341 
1342     // Assume that all loads of legal types cost 1.
1343     InstructionCost Cost = LT.first;
1344     if (CostKind != TTI::TCK_RecipThroughput)
1345       return Cost;
1346 
1347     const DataLayout &DL = this->getDataLayout();
1348     if (Src->isVectorTy() &&
1349         // In practice it's not currently possible to have a change in lane
1350         // length for extending loads or truncating stores so both types should
1351         // have the same scalable property.
1352         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1353                             LT.second.getSizeInBits())) {
1354       // This is a vector load that legalizes to a larger type than the vector
1355       // itself. Unless the corresponding extending load or truncating store is
1356       // legal, this will scalarize.
1357       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1358       EVT MemVT = getTLI()->getValueType(DL, Src);
1359       if (Opcode == Instruction::Store)
1360         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1361       else
1362         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1363 
1364       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1365         // This is a vector load/store for some illegal type that is scalarized.
1366         // We must account for the cost of building or decomposing the vector.
1367         Cost += getScalarizationOverhead(
1368             cast<VectorType>(Src), Opcode != Instruction::Store,
1369             Opcode == Instruction::Store, CostKind);
1370       }
1371     }
1372 
1373     return Cost;
1374   }
1375 
1376   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1377                                         Align Alignment, unsigned AddressSpace,
1378                                         TTI::TargetCostKind CostKind) {
1379     // TODO: Pass on AddressSpace when we have test coverage.
1380     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1381                                        CostKind);
1382   }
1383 
1384   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1385                                          const Value *Ptr, bool VariableMask,
1386                                          Align Alignment,
1387                                          TTI::TargetCostKind CostKind,
1388                                          const Instruction *I = nullptr) {
1389     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1390                                        true, CostKind);
1391   }
1392 
1393   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
1394                                          const Value *Ptr, bool VariableMask,
1395                                          Align Alignment,
1396                                          TTI::TargetCostKind CostKind,
1397                                          const Instruction *I) {
1398     // For a target without strided memory operations (or for an illegal
1399     // operation type on one which does), assume we lower to a gather/scatter
1400     // operation, which may in turn be scalarized.
1401     return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1402                                            Alignment, CostKind, I);
1403   }
1404 
1405   InstructionCost getInterleavedMemoryOpCost(
1406       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1407       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1408       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1409 
1410     // We cannot scalarize scalable vectors, so return Invalid.
1411     if (isa<ScalableVectorType>(VecTy))
1412       return InstructionCost::getInvalid();
1413 
1414     auto *VT = cast<FixedVectorType>(VecTy);
1415 
1416     unsigned NumElts = VT->getNumElements();
1417     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1418 
1419     unsigned NumSubElts = NumElts / Factor;
1420     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1421 
1422     // First, the cost of the load/store operation.
1423     InstructionCost Cost;
1424     if (UseMaskForCond || UseMaskForGaps)
1425       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1426                                             AddressSpace, CostKind);
1427     else
1428       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1429                                       CostKind);
1430 
1431     // Legalize the vector type, and get the legalized and unlegalized type
1432     // sizes.
1433     MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1434     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1435     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1436 
1437     // Scale the cost of the memory operation by the fraction of legalized
1438     // instructions that will actually be used. We shouldn't account for the
1439     // cost of dead instructions since they will be removed.
1440     //
1441     // E.g., An interleaved load of factor 8:
1442     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1443     //       %v0 = shufflevector %vec, undef, <0, 8>
1444     //
1445     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1446     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1447     // type). The other loads are unused.
1448     //
1449     // TODO: Note that legalization can turn masked loads/stores into unmasked
1450     // (legalized) loads/stores. This can be reflected in the cost.
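    // As a rough illustration: in the factor-8 example above only 2 of the 8
    // legal loads are used, so the memory cost computed above is scaled by 2/8.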
1451     if (Cost.isValid() && VecTySize > VecTyLTSize) {
1452       // The number of loads of a legal type it will take to represent a load
1453       // of the unlegalized vector type.
1454       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1455 
1456       // The number of elements of the unlegalized type that correspond to a
1457       // single legal instruction.
1458       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1459 
1460       // Determine which legal instructions will be used.
1461       BitVector UsedInsts(NumLegalInsts, false);
1462       for (unsigned Index : Indices)
1463         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1464           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1465 
1466       // Scale the cost of the load by the fraction of legal instructions that
1467       // will be used.
1468       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1469     }
1470 
1471     // Then add the cost of the interleave operation.
1472     assert(Indices.size() <= Factor &&
1473            "Interleaved memory op has too many members");
1474 
1475     const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1476     const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1477 
1478     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1479     for (unsigned Index : Indices) {
1480       assert(Index < Factor && "Invalid index for interleaved memory op");
1481       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1482         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1483     }
1484 
1485     if (Opcode == Instruction::Load) {
1486       // The interleave cost is similar to extracting the sub vectors' elements
1487       // from the wide vector and inserting them into the sub vectors.
1488       //
1489       // E.g. An interleaved load of factor 2 (with one member of index 0):
1490       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1491       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1492       // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1493       // <8 x i32> vector and inserting them into a <4 x i32> vector.
1494       InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1495           SubVT, DemandedAllSubElts,
1496           /*Insert*/ true, /*Extract*/ false, CostKind);
1497       Cost += Indices.size() * InsSubCost;
1498       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1499                                                 /*Insert*/ false,
1500                                                 /*Extract*/ true, CostKind);
1501     } else {
1502       // The interleave cost is extracting elements from the sub vectors and
1503       // inserting them into the wide vector.
1504       //
1505       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1506       // (using VF=4):
1507       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1508       //    %gaps.mask = <true, true, false, true, true, false,
1509       //                  true, true, false, true, true, false>
1510       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1511       //                           i32 Align, <12 x i1> %gaps.mask
1512       // The cost is estimated as extracting all elements (of actual members,
1513       // excluding gaps) from both <4 x i32> vectors and inserting them into
1514       // the <12 x i32> vector.
1515       InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1516           SubVT, DemandedAllSubElts,
1517           /*Insert*/ false, /*Extract*/ true, CostKind);
1518       Cost += ExtSubCost * Indices.size();
1519       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1520                                                 /*Insert*/ true,
1521                                                 /*Extract*/ false, CostKind);
1522     }
1523 
1524     if (!UseMaskForCond)
1525       return Cost;
1526 
1527     Type *I8Type = Type::getInt8Ty(VT->getContext());
1528 
1529     Cost += thisT()->getReplicationShuffleCost(
1530         I8Type, Factor, NumSubElts,
1531         UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1532         CostKind);
1533 
1534     // The Gaps mask is invariant and created outside the loop, therefore the
1535     // cost of creating it is not accounted for here. However if we have both
1536     // a MaskForGaps and some other mask that guards the execution of the
1537     // memory access, we need to account for the cost of And-ing the two masks
1538     // inside the loop.
1539     if (UseMaskForGaps) {
1540       auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1541       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1542                                               CostKind);
1543     }
1544 
1545     return Cost;
1546   }
1547 
1548   /// Get intrinsic cost based on arguments.
1549   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1550                                         TTI::TargetCostKind CostKind) {
1551     // Check for generically free intrinsics.
1552     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1553       return 0;
1554 
1555     // Assume that target intrinsics are cheap.
1556     Intrinsic::ID IID = ICA.getID();
1557     if (Function::isTargetIntrinsic(IID))
1558       return TargetTransformInfo::TCC_Basic;
1559 
1560     if (ICA.isTypeBasedOnly())
1561       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1562 
1563     Type *RetTy = ICA.getReturnType();
1564 
1565     ElementCount RetVF =
1566         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1567                              : ElementCount::getFixed(1));
1568     const IntrinsicInst *I = ICA.getInst();
1569     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1570     FastMathFlags FMF = ICA.getFlags();
1571     switch (IID) {
1572     default:
1573       break;
1574 
1575     case Intrinsic::powi:
1576       if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1577         bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1578         if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1579                                                ShouldOptForSize)) {
1580           // The cost is modeled on the expansion performed by ExpandPowI in
1581           // SelectionDAGBuilder.
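          // E.g. for powi(x, 13) (13 = 0b1101): ActiveBits = 4 and PopCount = 3,
          // giving (4 + 3 - 2) = 5 fmuls, which matches a square-and-multiply
          // chain of three squarings plus two multiplies of the selected powers.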
1582           APInt Exponent = RHSC->getValue().abs();
1583           unsigned ActiveBits = Exponent.getActiveBits();
1584           unsigned PopCount = Exponent.popcount();
1585           InstructionCost Cost = (ActiveBits + PopCount - 2) *
1586                                  thisT()->getArithmeticInstrCost(
1587                                      Instruction::FMul, RetTy, CostKind);
1588           if (RHSC->isNegative())
1589             Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1590                                                     CostKind);
1591           return Cost;
1592         }
1593       }
1594       break;
1595     case Intrinsic::cttz:
1596       // FIXME: If necessary, this should go in target-specific overrides.
1597       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1598         return TargetTransformInfo::TCC_Basic;
1599       break;
1600 
1601     case Intrinsic::ctlz:
1602       // FIXME: If necessary, this should go in target-specific overrides.
1603       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1604         return TargetTransformInfo::TCC_Basic;
1605       break;
1606 
1607     case Intrinsic::memcpy:
1608       return thisT()->getMemcpyCost(ICA.getInst());
1609 
1610     case Intrinsic::masked_scatter: {
1611       const Value *Mask = Args[3];
1612       bool VarMask = !isa<Constant>(Mask);
1613       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1614       return thisT()->getGatherScatterOpCost(Instruction::Store,
1615                                              ICA.getArgTypes()[0], Args[1],
1616                                              VarMask, Alignment, CostKind, I);
1617     }
1618     case Intrinsic::masked_gather: {
1619       const Value *Mask = Args[2];
1620       bool VarMask = !isa<Constant>(Mask);
1621       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1622       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1623                                              VarMask, Alignment, CostKind, I);
1624     }
1625     case Intrinsic::experimental_vp_strided_store: {
1626       const Value *Data = Args[0];
1627       const Value *Ptr = Args[1];
1628       const Value *Mask = Args[3];
1629       const Value *EVL = Args[4];
1630       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1631       Align Alignment = I->getParamAlign(1).valueOrOne();
1632       return thisT()->getStridedMemoryOpCost(Instruction::Store,
1633                                              Data->getType(), Ptr, VarMask,
1634                                              Alignment, CostKind, I);
1635     }
1636     case Intrinsic::experimental_vp_strided_load: {
1637       const Value *Ptr = Args[0];
1638       const Value *Mask = Args[2];
1639       const Value *EVL = Args[3];
1640       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1641       Align Alignment = I->getParamAlign(0).valueOrOne();
1642       return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1643                                              VarMask, Alignment, CostKind, I);
1644     }
1645     case Intrinsic::experimental_stepvector: {
1646       if (isa<ScalableVectorType>(RetTy))
1647         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1648       // The cost of materialising a constant integer vector.
1649       return TargetTransformInfo::TCC_Basic;
1650     }
1651     case Intrinsic::vector_extract: {
1652       // FIXME: Handle case where a scalable vector is extracted from a scalable
1653       // vector
1654       if (isa<ScalableVectorType>(RetTy))
1655         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1656       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1657       return thisT()->getShuffleCost(
1658           TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1659           std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1660     }
1661     case Intrinsic::vector_insert: {
1662       // FIXME: Handle case where a scalable vector is inserted into a scalable
1663       // vector
1664       if (isa<ScalableVectorType>(Args[1]->getType()))
1665         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1666       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1667       return thisT()->getShuffleCost(
1668           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1669           std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1670     }
1671     case Intrinsic::vector_reverse: {
1672       return thisT()->getShuffleCost(
1673           TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1674           CostKind, 0, cast<VectorType>(RetTy));
1675     }
1676     case Intrinsic::vector_splice: {
1677       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1678       return thisT()->getShuffleCost(
1679           TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1680           CostKind, Index, cast<VectorType>(RetTy));
1681     }
1682     case Intrinsic::vector_reduce_add:
1683     case Intrinsic::vector_reduce_mul:
1684     case Intrinsic::vector_reduce_and:
1685     case Intrinsic::vector_reduce_or:
1686     case Intrinsic::vector_reduce_xor:
1687     case Intrinsic::vector_reduce_smax:
1688     case Intrinsic::vector_reduce_smin:
1689     case Intrinsic::vector_reduce_fmax:
1690     case Intrinsic::vector_reduce_fmin:
1691     case Intrinsic::vector_reduce_fmaximum:
1692     case Intrinsic::vector_reduce_fminimum:
1693     case Intrinsic::vector_reduce_umax:
1694     case Intrinsic::vector_reduce_umin: {
1695       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1696       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1697     }
1698     case Intrinsic::vector_reduce_fadd:
1699     case Intrinsic::vector_reduce_fmul: {
1700       IntrinsicCostAttributes Attrs(
1701           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1702       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1703     }
1704     case Intrinsic::fshl:
1705     case Intrinsic::fshr: {
1706       const Value *X = Args[0];
1707       const Value *Y = Args[1];
1708       const Value *Z = Args[2];
1709       const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1710       const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1711       const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1712       const TTI::OperandValueInfo OpInfoBW =
1713         {TTI::OK_UniformConstantValue,
1714          isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1715          : TTI::OP_None};
1716 
1717       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1718       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1719       InstructionCost Cost = 0;
1720       Cost +=
1721           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1722       Cost +=
1723           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1724       Cost += thisT()->getArithmeticInstrCost(
1725           BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1726           {OpInfoZ.Kind, TTI::OP_None});
1727       Cost += thisT()->getArithmeticInstrCost(
1728           BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1729           {OpInfoZ.Kind, TTI::OP_None});
1730       // Non-constant shift amounts require a modulo.
1731       if (!OpInfoZ.isConstant())
1732         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1733                                                 CostKind, OpInfoZ, OpInfoBW);
1734       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
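      // (When Z % BW == 0 the complementary shift amount would be BW, which is
      // poison, so the expansion needs a compare and select to return the
      // unshifted operand instead.)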
1735       if (X != Y) {
1736         Type *CondTy = RetTy->getWithNewBitWidth(1);
1737         Cost +=
1738             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1739                                         CmpInst::ICMP_EQ, CostKind);
1740         Cost +=
1741             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1742                                         CmpInst::ICMP_EQ, CostKind);
1743       }
1744       return Cost;
1745     }
1746     case Intrinsic::get_active_lane_mask: {
1747       EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1748       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1749 
1750       // If we're not expanding the intrinsic then we assume this is cheap
1751       // to implement.
1752       if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1753         return getTypeLegalizationCost(RetTy).first;
1754       }
1755 
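      // Otherwise the mask is costed as if it were expanded into a saturating
      // add of a step vector followed by an unsigned compare against the trip
      // count, roughly:
      //   %m = icmp ult (uadd.sat(splat(%base), stepvector)), splat(%n)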
1756       // Create the expanded types that will be used to calculate the uadd_sat
1757       // operation.
1758       Type *ExpRetTy = VectorType::get(
1759           ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1760       IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1761       InstructionCost Cost =
1762           thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1763       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1764                                           CmpInst::ICMP_ULT, CostKind);
1765       return Cost;
1766     }
1767     case Intrinsic::experimental_cttz_elts: {
1768       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1769 
1770       // If we're not expanding the intrinsic then we assume this is cheap
1771       // to implement.
1772       if (!getTLI()->shouldExpandCttzElements(ArgType))
1773         return getTypeLegalizationCost(RetTy).first;
1774 
1775       // TODO: The costs below reflect the expansion code in
1776       // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1777       // favour of compile time.
1778 
1779       // Find the smallest "sensible" element type to use for the expansion.
1780       bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1781       ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1782       if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1783         VScaleRange = getVScaleRange(I->getCaller(), 64);
1784 
1785       unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1786           RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1787       Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1788 
1789       // Create the new vector type and get the vector length.
1790       Type *NewVecTy = VectorType::get(
1791           NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1792 
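      // The expansion is costed roughly as:
      //   %idx = sub(splat(VL), stepvector)        ; <VL, VL-1, ..., 1>
      //   %res = sub(VL, reduce.umax(sext(%mask) & %idx))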
1793       IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
1794                                            NewVecTy, {}, FMF);
1795       InstructionCost Cost =
1796           thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1797 
1798       Cost +=
1799           thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1800       Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1801                                         Args[0]->getType(),
1802                                         TTI::CastContextHint::None, CostKind);
1803       Cost +=
1804           thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1805 
1806       IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1807                                          NewEltTy, NewVecTy, FMF, I, 1);
1808       Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1809       Cost +=
1810           thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1811 
1812       return Cost;
1813     }
1814     }
1815 
1816     // VP Intrinsics should have the same cost as their non-vp counterparts.
1817     // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1818     // counterpart when the vector length argument is smaller than the maximum
1819     // vector length.
1820     // TODO: Support other kinds of VPIntrinsics
1821     if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1822       std::optional<unsigned> FOp =
1823           VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1824       if (FOp) {
1825         if (ICA.getID() == Intrinsic::vp_load) {
1826           Align Alignment;
1827           if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1828             Alignment = VPI->getPointerAlignment().valueOrOne();
1829           unsigned AS = 0;
1830           if (ICA.getArgs().size() > 1)
1831             if (auto *PtrTy =
1832                     dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1833               AS = PtrTy->getAddressSpace();
1834           return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1835                                           AS, CostKind);
1836         }
1837         if (ICA.getID() == Intrinsic::vp_store) {
1838           Align Alignment;
1839           if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1840             Alignment = VPI->getPointerAlignment().valueOrOne();
1841           unsigned AS = 0;
1842           if (ICA.getArgs().size() >= 2)
1843             if (auto *PtrTy =
1844                     dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1845               AS = PtrTy->getAddressSpace();
1846           return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1847                                           AS, CostKind);
1848         }
1849         if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
1850           return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1851                                                  CostKind);
1852         }
1853       }
1854 
1855       std::optional<Intrinsic::ID> FID =
1856           VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1857       if (FID) {
1858         // Non-vp version will have the same Args/Tys except mask and vector length.
1859         assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1860                "Expected VPIntrinsic to have Mask and Vector Length args and "
1861                "types");
1862         ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
1863 
1864         // VPReduction intrinsics have a start value argument that their non-vp
1865         // counterparts do not have, except for the fadd and fmul non-vp
1866         // counterparts.
1867         if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
1868             *FID != Intrinsic::vector_reduce_fadd &&
1869             *FID != Intrinsic::vector_reduce_fmul)
1870           NewTys = NewTys.drop_front();
1871 
1872         IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1873                                        ICA.getFlags());
1874         return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1875       }
1876     }
1877 
1878     // Assume that we need to scalarize this intrinsic.
1879     // Compute the scalarization overhead based on Args for a vector
1880     // intrinsic.
1881     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1882     if (RetVF.isVector() && !RetVF.isScalable()) {
1883       ScalarizationCost = 0;
1884       if (!RetTy->isVoidTy())
1885         ScalarizationCost += getScalarizationOverhead(
1886             cast<VectorType>(RetTy),
1887             /*Insert*/ true, /*Extract*/ false, CostKind);
1888       ScalarizationCost +=
1889           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1890     }
1891 
1892     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1893                                   ScalarizationCost);
1894     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1895   }
1896 
1897   /// Get intrinsic cost based on argument types.
1898   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1899   /// cost of scalarizing the arguments and the return value will be computed
1900   /// based on types.
1901   InstructionCost
1902   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1903                                  TTI::TargetCostKind CostKind) {
1904     Intrinsic::ID IID = ICA.getID();
1905     Type *RetTy = ICA.getReturnType();
1906     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1907     FastMathFlags FMF = ICA.getFlags();
1908     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1909     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1910 
1911     VectorType *VecOpTy = nullptr;
1912     if (!Tys.empty()) {
1913       // The vector reduction operand is operand 0 except for fadd/fmul.
1914       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1915       unsigned VecTyIndex = 0;
1916       if (IID == Intrinsic::vector_reduce_fadd ||
1917           IID == Intrinsic::vector_reduce_fmul)
1918         VecTyIndex = 1;
1919       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1920       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1921     }
1922 
1923     // Library call cost - other than size, make it expensive.
1924     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1925     unsigned ISD = 0;
1926     switch (IID) {
1927     default: {
1928       // Scalable vectors cannot be scalarized, so return Invalid.
1929       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1930             return isa<ScalableVectorType>(Ty);
1931           }))
1932         return InstructionCost::getInvalid();
1933 
1934       // Assume that we need to scalarize this intrinsic.
1935       InstructionCost ScalarizationCost =
1936           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1937       unsigned ScalarCalls = 1;
1938       Type *ScalarRetTy = RetTy;
1939       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1940         if (!SkipScalarizationCost)
1941           ScalarizationCost = getScalarizationOverhead(
1942               RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1943         ScalarCalls = std::max(ScalarCalls,
1944                                cast<FixedVectorType>(RetVTy)->getNumElements());
1945         ScalarRetTy = RetTy->getScalarType();
1946       }
1947       SmallVector<Type *, 4> ScalarTys;
1948       for (Type *Ty : Tys) {
1949         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1950           if (!SkipScalarizationCost)
1951             ScalarizationCost += getScalarizationOverhead(
1952                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1953           ScalarCalls = std::max(ScalarCalls,
1954                                  cast<FixedVectorType>(VTy)->getNumElements());
1955           Ty = Ty->getScalarType();
1956         }
1957         ScalarTys.push_back(Ty);
1958       }
1959       if (ScalarCalls == 1)
1960         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1961 
1962       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1963       InstructionCost ScalarCost =
1964           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1965 
1966       return ScalarCalls * ScalarCost + ScalarizationCost;
1967     }
1968     // Look for intrinsics that can be lowered directly or turned into a scalar
1969     // intrinsic call.
1970     case Intrinsic::sqrt:
1971       ISD = ISD::FSQRT;
1972       break;
1973     case Intrinsic::sin:
1974       ISD = ISD::FSIN;
1975       break;
1976     case Intrinsic::cos:
1977       ISD = ISD::FCOS;
1978       break;
1979     case Intrinsic::tan:
1980       ISD = ISD::FTAN;
1981       break;
1982     case Intrinsic::asin:
1983       ISD = ISD::FASIN;
1984       break;
1985     case Intrinsic::acos:
1986       ISD = ISD::FACOS;
1987       break;
1988     case Intrinsic::atan:
1989       ISD = ISD::FATAN;
1990       break;
1991     case Intrinsic::sinh:
1992       ISD = ISD::FSINH;
1993       break;
1994     case Intrinsic::cosh:
1995       ISD = ISD::FCOSH;
1996       break;
1997     case Intrinsic::tanh:
1998       ISD = ISD::FTANH;
1999       break;
2000     case Intrinsic::exp:
2001       ISD = ISD::FEXP;
2002       break;
2003     case Intrinsic::exp2:
2004       ISD = ISD::FEXP2;
2005       break;
2006     case Intrinsic::exp10:
2007       ISD = ISD::FEXP10;
2008       break;
2009     case Intrinsic::log:
2010       ISD = ISD::FLOG;
2011       break;
2012     case Intrinsic::log10:
2013       ISD = ISD::FLOG10;
2014       break;
2015     case Intrinsic::log2:
2016       ISD = ISD::FLOG2;
2017       break;
2018     case Intrinsic::fabs:
2019       ISD = ISD::FABS;
2020       break;
2021     case Intrinsic::canonicalize:
2022       ISD = ISD::FCANONICALIZE;
2023       break;
2024     case Intrinsic::minnum:
2025       ISD = ISD::FMINNUM;
2026       break;
2027     case Intrinsic::maxnum:
2028       ISD = ISD::FMAXNUM;
2029       break;
2030     case Intrinsic::minimum:
2031       ISD = ISD::FMINIMUM;
2032       break;
2033     case Intrinsic::maximum:
2034       ISD = ISD::FMAXIMUM;
2035       break;
2036     case Intrinsic::copysign:
2037       ISD = ISD::FCOPYSIGN;
2038       break;
2039     case Intrinsic::floor:
2040       ISD = ISD::FFLOOR;
2041       break;
2042     case Intrinsic::ceil:
2043       ISD = ISD::FCEIL;
2044       break;
2045     case Intrinsic::trunc:
2046       ISD = ISD::FTRUNC;
2047       break;
2048     case Intrinsic::nearbyint:
2049       ISD = ISD::FNEARBYINT;
2050       break;
2051     case Intrinsic::rint:
2052       ISD = ISD::FRINT;
2053       break;
2054     case Intrinsic::lrint:
2055       ISD = ISD::LRINT;
2056       break;
2057     case Intrinsic::llrint:
2058       ISD = ISD::LLRINT;
2059       break;
2060     case Intrinsic::round:
2061       ISD = ISD::FROUND;
2062       break;
2063     case Intrinsic::roundeven:
2064       ISD = ISD::FROUNDEVEN;
2065       break;
2066     case Intrinsic::pow:
2067       ISD = ISD::FPOW;
2068       break;
2069     case Intrinsic::fma:
2070       ISD = ISD::FMA;
2071       break;
2072     case Intrinsic::fmuladd:
2073       ISD = ISD::FMA;
2074       break;
2075     case Intrinsic::experimental_constrained_fmuladd:
2076       ISD = ISD::STRICT_FMA;
2077       break;
2078     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2079     case Intrinsic::lifetime_start:
2080     case Intrinsic::lifetime_end:
2081     case Intrinsic::sideeffect:
2082     case Intrinsic::pseudoprobe:
2083     case Intrinsic::arithmetic_fence:
2084       return 0;
2085     case Intrinsic::masked_store: {
2086       Type *Ty = Tys[0];
2087       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2088       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2089                                             CostKind);
2090     }
2091     case Intrinsic::masked_load: {
2092       Type *Ty = RetTy;
2093       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2094       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2095                                             CostKind);
2096     }
2097     case Intrinsic::vector_reduce_add:
2098     case Intrinsic::vector_reduce_mul:
2099     case Intrinsic::vector_reduce_and:
2100     case Intrinsic::vector_reduce_or:
2101     case Intrinsic::vector_reduce_xor:
2102       return thisT()->getArithmeticReductionCost(
2103           getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2104           CostKind);
2105     case Intrinsic::vector_reduce_fadd:
2106     case Intrinsic::vector_reduce_fmul:
2107       return thisT()->getArithmeticReductionCost(
2108           getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2109     case Intrinsic::vector_reduce_smax:
2110     case Intrinsic::vector_reduce_smin:
2111     case Intrinsic::vector_reduce_umax:
2112     case Intrinsic::vector_reduce_umin:
2113     case Intrinsic::vector_reduce_fmax:
2114     case Intrinsic::vector_reduce_fmin:
2115     case Intrinsic::vector_reduce_fmaximum:
2116     case Intrinsic::vector_reduce_fminimum:
2117       return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2118                                              VecOpTy, ICA.getFlags(), CostKind);
2119     case Intrinsic::abs: {
2120       // abs(X) = select(icmp(X,0),X,sub(0,X))
2121       Type *CondTy = RetTy->getWithNewBitWidth(1);
2122       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2123       InstructionCost Cost = 0;
2124       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2125                                           Pred, CostKind);
2126       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2127                                           Pred, CostKind);
2128       // TODO: Should we add an OperandValueProperties::OP_Zero property?
2129       Cost += thisT()->getArithmeticInstrCost(
2130          BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
2131       return Cost;
2132     }
2133     case Intrinsic::smax:
2134     case Intrinsic::smin:
2135     case Intrinsic::umax:
2136     case Intrinsic::umin: {
2137       // minmax(X,Y) = select(icmp(X,Y),X,Y)
2138       Type *CondTy = RetTy->getWithNewBitWidth(1);
2139       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2140       CmpInst::Predicate Pred =
2141           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2142       InstructionCost Cost = 0;
2143       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2144                                           Pred, CostKind);
2145       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2146                                           Pred, CostKind);
2147       return Cost;
2148     }
2149     case Intrinsic::sadd_sat:
2150     case Intrinsic::ssub_sat: {
2151       Type *CondTy = RetTy->getWithNewBitWidth(1);
2152 
2153       Type *OpTy = StructType::create({RetTy, CondTy});
2154       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2155                                      ? Intrinsic::sadd_with_overflow
2156                                      : Intrinsic::ssub_with_overflow;
2157       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2158 
2159       // SatMax -> Overflow && SumDiff < 0
2160       // SatMin -> Overflow && SumDiff >= 0
2161       InstructionCost Cost = 0;
2162       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2163                                     nullptr, ScalarizationCostPassed);
2164       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2165       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2166                                           Pred, CostKind);
2167       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2168                                               CondTy, Pred, CostKind);
2169       return Cost;
2170     }
2171     case Intrinsic::uadd_sat:
2172     case Intrinsic::usub_sat: {
2173       Type *CondTy = RetTy->getWithNewBitWidth(1);
2174 
2175       Type *OpTy = StructType::create({RetTy, CondTy});
2176       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2177                                      ? Intrinsic::uadd_with_overflow
2178                                      : Intrinsic::usub_with_overflow;
2179 
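      // The saturating forms are costed as the matching overflow intrinsic
      // followed by a select of the saturation constant (all-ones for
      // uadd.sat, zero for usub.sat).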
2180       InstructionCost Cost = 0;
2181       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2182                                     nullptr, ScalarizationCostPassed);
2183       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2184       Cost +=
2185           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2186                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2187       return Cost;
2188     }
2189     case Intrinsic::smul_fix:
2190     case Intrinsic::umul_fix: {
2191       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2192       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2193 
2194       unsigned ExtOp =
2195           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2196       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2197 
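      // The fixed-point multiply is costed roughly as a double-width multiply
      // whose halves are recombined according to the scale:
      //   extend, mul, truncate Lo/Hi, then (Lo >> Scale) | (Hi << (BW - Scale))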
2198       InstructionCost Cost = 0;
2199       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2200       Cost +=
2201           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2202       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2203                                             CCH, CostKind);
2204       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2205                                               CostKind,
2206                                               {TTI::OK_AnyValue, TTI::OP_None},
2207                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2208       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2209                                               {TTI::OK_AnyValue, TTI::OP_None},
2210                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2211       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2212       return Cost;
2213     }
2214     case Intrinsic::sadd_with_overflow:
2215     case Intrinsic::ssub_with_overflow: {
2216       Type *SumTy = RetTy->getContainedType(0);
2217       Type *OverflowTy = RetTy->getContainedType(1);
2218       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2219                             ? BinaryOperator::Add
2220                             : BinaryOperator::Sub;
2221 
2222       //   Add:
2223       //   Overflow -> (Result < LHS) ^ (RHS < 0)
2224       //   Sub:
2225       //   Overflow -> (Result < LHS) ^ (RHS > 0)
2226       InstructionCost Cost = 0;
2227       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2228       Cost += 2 * thisT()->getCmpSelInstrCost(
2229                       Instruction::ICmp, SumTy, OverflowTy,
2230                       CmpInst::ICMP_SGT, CostKind);
2231       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2232                                               CostKind);
2233       return Cost;
2234     }
2235     case Intrinsic::uadd_with_overflow:
2236     case Intrinsic::usub_with_overflow: {
2237       Type *SumTy = RetTy->getContainedType(0);
2238       Type *OverflowTy = RetTy->getContainedType(1);
2239       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2240                             ? BinaryOperator::Add
2241                             : BinaryOperator::Sub;
2242       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2243                                     ? CmpInst::ICMP_ULT
2244                                     : CmpInst::ICMP_UGT;
2245 
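      // Unsigned overflow needs no extra arithmetic beyond the add/sub itself:
      //   uadd: Overflow -> (Result < LHS)
      //   usub: Overflow -> (Result > LHS)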
2246       InstructionCost Cost = 0;
2247       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2248       Cost +=
2249           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2250                                       Pred, CostKind);
2251       return Cost;
2252     }
2253     case Intrinsic::smul_with_overflow:
2254     case Intrinsic::umul_with_overflow: {
2255       Type *MulTy = RetTy->getContainedType(0);
2256       Type *OverflowTy = RetTy->getContainedType(1);
2257       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2258       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2259       bool IsSigned = IID == Intrinsic::smul_with_overflow;
2260 
2261       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2262       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2263 
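      // Costed as a double-width multiply: extend, multiply and truncate, then
      // compare the high half of the product (against zero for the unsigned
      // case, against the sign bits of the low half for the signed case).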
2264       InstructionCost Cost = 0;
2265       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2266       Cost +=
2267           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2268       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2269                                             CCH, CostKind);
2270       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2271                                               CostKind,
2272                                               {TTI::OK_AnyValue, TTI::OP_None},
2273                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2274 
2275       if (IsSigned)
2276         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2277                                                 CostKind,
2278                                                 {TTI::OK_AnyValue, TTI::OP_None},
2279                                                 {TTI::OK_UniformConstantValue, TTI::OP_None});
2280 
2281       Cost += thisT()->getCmpSelInstrCost(
2282           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2283       return Cost;
2284     }
2285     case Intrinsic::fptosi_sat:
2286     case Intrinsic::fptoui_sat: {
2287       if (Tys.empty())
2288         break;
2289       Type *FromTy = Tys[0];
2290       bool IsSigned = IID == Intrinsic::fptosi_sat;
2291 
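      // Costed as clamping the input to the destination range with
      // minnum/maxnum, performing the conversion, and, for the signed case,
      // selecting zero when the input is NaN.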
2292       InstructionCost Cost = 0;
2293       IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2294                                      {FromTy, FromTy});
2295       Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2296       IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2297                                      {FromTy, FromTy});
2298       Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2299       Cost += thisT()->getCastInstrCost(
2300           IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2301           TTI::CastContextHint::None, CostKind);
2302       if (IsSigned) {
2303         Type *CondTy = RetTy->getWithNewBitWidth(1);
2304         Cost += thisT()->getCmpSelInstrCost(
2305             BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2306         Cost += thisT()->getCmpSelInstrCost(
2307             BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2308       }
2309       return Cost;
2310     }
2311     case Intrinsic::ctpop:
2312       ISD = ISD::CTPOP;
2313       // In case of legalization use TCC_Expensive. This is cheaper than a
2314       // library call but still not a cheap instruction.
2315       SingleCallCost = TargetTransformInfo::TCC_Expensive;
2316       break;
2317     case Intrinsic::ctlz:
2318       ISD = ISD::CTLZ;
2319       break;
2320     case Intrinsic::cttz:
2321       ISD = ISD::CTTZ;
2322       break;
2323     case Intrinsic::bswap:
2324       ISD = ISD::BSWAP;
2325       break;
2326     case Intrinsic::bitreverse:
2327       ISD = ISD::BITREVERSE;
2328       break;
2329     }
2330 
2331     const TargetLoweringBase *TLI = getTLI();
2332     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2333 
2334     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2335       if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2336           TLI->isFAbsFree(LT.second)) {
2337         return 0;
2338       }
2339 
2340       // The operation is legal. Assume it costs 1.
2341       // If the type is split to multiple registers, assume that there is some
2342       // overhead to this.
2343       // TODO: Once we have extract/insert subvector cost we need to use them.
2344       if (LT.first > 1)
2345         return (LT.first * 2);
2346       else
2347         return (LT.first * 1);
2348     } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2349       // If the operation is custom lowered then assume
2350       // that the code is twice as expensive.
2351       return (LT.first * 2);
2352     }
2353 
2354     // If we can't lower fmuladd into an FMA, estimate the cost as a
2355     // floating-point mul followed by an add.
2356     if (IID == Intrinsic::fmuladd)
2357       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2358                                              CostKind) +
2359              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2360                                              CostKind);
2361     if (IID == Intrinsic::experimental_constrained_fmuladd) {
2362       IntrinsicCostAttributes FMulAttrs(
2363         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2364       IntrinsicCostAttributes FAddAttrs(
2365         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2366       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2367              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2368     }
2369 
2370     // Else, assume that we need to scalarize this intrinsic. For math builtins
2371     // this will emit a costly libcall, adding call overhead and spills. Make it
2372     // very expensive.
2373     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2374       // Scalable vectors cannot be scalarized, so return Invalid.
2375       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2376             return isa<ScalableVectorType>(Ty);
2377           }))
2378         return InstructionCost::getInvalid();
2379 
2380       InstructionCost ScalarizationCost =
2381           SkipScalarizationCost
2382               ? ScalarizationCostPassed
2383               : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2384                                          /*Extract*/ false, CostKind);
2385 
2386       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2387       SmallVector<Type *, 4> ScalarTys;
2388       for (Type *Ty : Tys) {
2389         if (Ty->isVectorTy())
2390           Ty = Ty->getScalarType();
2391         ScalarTys.push_back(Ty);
2392       }
2393       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2394       InstructionCost ScalarCost =
2395           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2396       for (Type *Ty : Tys) {
2397         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2398           if (!ICA.skipScalarizationCost())
2399             ScalarizationCost += getScalarizationOverhead(
2400                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2401           ScalarCalls = std::max(ScalarCalls,
2402                                  cast<FixedVectorType>(VTy)->getNumElements());
2403         }
2404       }
2405       return ScalarCalls * ScalarCost + ScalarizationCost;
2406     }
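    // Worked example (illustrative): scalarizing a unary intrinsic on
    // <4 x float> issues 4 scalar calls, so the cost returned above is
    // 4 * ScalarCost plus the overhead of 4 inserts into the result vector
    // and 4 extracts from the operand vector.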
2407 
2408     // This is going to be turned into a library call, so make it expensive.
2409     return SingleCallCost;
2410   }
2411 
2412   /// Compute the cost of the given call instruction.
2413   ///
2414   /// Compute the cost of calling function F with return type RetTy and
2415   /// argument types Tys. F might be nullptr; in that case the cost of an
2416   /// arbitrary call with the specified signature will be returned.
2417   /// This is used, for instance, when we estimate the cost of calling a
2418   /// vector counterpart of the given function.
2419   /// \param F Called function, might be nullptr.
2420   /// \param RetTy Return value type.
2421   /// \param Tys Argument types.
2422   /// \returns The cost of the call instruction.
2423   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2424                                    ArrayRef<Type *> Tys,
2425                                    TTI::TargetCostKind CostKind) {
2426     return 10;
2427   }
2428 
2429   unsigned getNumberOfParts(Type *Tp) {
2430     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2431     return LT.first.isValid() ? *LT.first.getValue() : 0;
2432   }
2433 
2434   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2435                                             const SCEV *) {
2436     return 0;
2437   }
2438 
2439   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2440   /// We're assuming that reduction operations are performed in the following way:
2441   ///
2442   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2443   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef>
2444   ///            \----------------v-------------/  \----------v------------/
2445   ///                            n/2 elements               n/2 elements
2446   /// %red1 = op <n x t> %val, <n x t> %val1
2447   /// After this operation we have a vector %red1 in which only the first n/2
2448   /// elements are meaningful; the second n/2 elements are undefined and can be
2449   /// dropped. All subsequent operations effectively work on a vector of
2450   /// length n/2, not n, even though the real vector length is still n.
2451   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2452   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef>
2453   ///            \----------------v-------------/  \----------v------------/
2454   ///                            n/4 elements               3*n/4 elements
2455   /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
2456   /// length n/2, the resulting vector has length n/4, and so on.
2457   ///
2458   /// The cost model should take into account that the actual length of the
2459   /// vector is reduced on each iteration.
2460   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2461                                        TTI::TargetCostKind CostKind) {
2462     // Targets must implement a default value for the scalable case, since
2463     // we don't know how many lanes the vector has.
2464     if (isa<ScalableVectorType>(Ty))
2465       return InstructionCost::getInvalid();
2466 
2467     Type *ScalarTy = Ty->getElementType();
2468     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2469     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2470         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2471         NumVecElts >= 2) {
2472       // Or reduction for i1 is represented as:
2473       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2474       // %res = cmp ne iReduxWidth %val, 0
2475       // And reduction for i1 is represented as:
2476       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2477       // %res = cmp eq iReduxWidth %val, -1
2478       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2479       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2480                                        TTI::CastContextHint::None, CostKind) +
2481              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2482                                          CmpInst::makeCmpResultType(ValTy),
2483                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2484     }
2485     unsigned NumReduxLevels = Log2_32(NumVecElts);
2486     InstructionCost ArithCost = 0;
2487     InstructionCost ShuffleCost = 0;
2488     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2489     unsigned LongVectorCount = 0;
2490     unsigned MVTLen =
2491         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2492     while (NumVecElts > MVTLen) {
2493       NumVecElts /= 2;
2494       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2495       ShuffleCost +=
2496           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2497                                   CostKind, NumVecElts, SubTy);
2498       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2499       Ty = SubTy;
2500       ++LongVectorCount;
2501     }
2502 
2503     NumReduxLevels -= LongVectorCount;
2504 
2505     // The minimal length of the vector is limited by the real length of vector
2506     // operations supported by the target. That is why the remaining reduction
2507     // levels are performed on vectors of the same architecture-dependent
2508     // length.
2509 
2510     // By default reductions need one shuffle per reduction level.
2511     ShuffleCost +=
2512         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2513                                                  std::nullopt, CostKind, 0, Ty);
2514     ArithCost +=
2515         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2516     return ShuffleCost + ArithCost +
2517            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2518                                        CostKind, 0, nullptr, nullptr);
2519   }
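  // Worked example for getTreeReductionCost above (illustrative, hypothetical
  // target with a legal <4 x float>): an fadd reduction of <16 x float> is
  // modeled as two extract-subvector shuffles plus adds on <8 x float> and
  // <4 x float> (the while loop), then Log2(16) - 2 = 2 remaining levels of
  // single-source permute plus add on <4 x float>, and finally one
  // extractelement of lane 0.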
2520 
2521   /// Try to calculate the cost of performing strict (in-order) reductions,
2522   /// which involves doing a sequence of floating point additions in lane
2523   /// order, starting with an initial value. For example, consider a scalar
2524   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2525   ///
2526   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2527   ///
2528   ///   %add1 = %InitVal + %v0
2529   ///   %add2 = %add1 + %v1
2530   ///   %add3 = %add2 + %v2
2531   ///   %add4 = %add3 + %v3
2532   ///
2533   /// As a simple estimate we can say the cost of such a reduction is 4 times
2534   /// the cost of a scalar FP addition. We can only estimate the costs for
2535   /// fixed-width vectors here because for scalable vectors we do not know the
2536   /// runtime number of operations.
2537   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2538                                           TTI::TargetCostKind CostKind) {
2539     // Targets must implement a default value for the scalable case, since
2540     // we don't know how many lanes the vector has.
2541     if (isa<ScalableVectorType>(Ty))
2542       return InstructionCost::getInvalid();
2543 
2544     auto *VTy = cast<FixedVectorType>(Ty);
2545     InstructionCost ExtractCost = getScalarizationOverhead(
2546         VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2547     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2548         Opcode, VTy->getElementType(), CostKind);
2549     ArithCost *= VTy->getNumElements();
2550 
2551     return ExtractCost + ArithCost;
2552   }
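  // Worked example for getOrderedReductionCost above (illustrative): an
  // in-order fadd reduction of <4 x float> is costed as the overhead of
  // extracting all 4 lanes plus 4 scalar fadds, matching the lane-by-lane
  // chain shown in the comment above.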
2553 
2554   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2555                                              std::optional<FastMathFlags> FMF,
2556                                              TTI::TargetCostKind CostKind) {
2557     assert(Ty && "Unknown reduction vector type");
2558     if (TTI::requiresOrderedReduction(FMF))
2559       return getOrderedReductionCost(Opcode, Ty, CostKind);
2560     return getTreeReductionCost(Opcode, Ty, CostKind);
2561   }
2562 
2563   /// Try to calculate op costs for min/max reduction operations.
2565   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2566                                          FastMathFlags FMF,
2567                                          TTI::TargetCostKind CostKind) {
2568     // Targets must implement a default value for the scalable case, since
2569     // we don't know how many lanes the vector has.
2570     if (isa<ScalableVectorType>(Ty))
2571       return InstructionCost::getInvalid();
2572 
2573     Type *ScalarTy = Ty->getElementType();
2574     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2575     unsigned NumReduxLevels = Log2_32(NumVecElts);
2576     InstructionCost MinMaxCost = 0;
2577     InstructionCost ShuffleCost = 0;
2578     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2579     unsigned LongVectorCount = 0;
2580     unsigned MVTLen =
2581         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2582     while (NumVecElts > MVTLen) {
2583       NumVecElts /= 2;
2584       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2585 
2586       ShuffleCost +=
2587           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2588                                   CostKind, NumVecElts, SubTy);
2589 
2590       IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2591       MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2592       Ty = SubTy;
2593       ++LongVectorCount;
2594     }
2595 
2596     NumReduxLevels -= LongVectorCount;
2597 
2598     // The minimal length of the vector is limited by the real length of vector
2599     // operations supported by the target. That is why the remaining reduction
2600     // levels are performed on vectors of the same architecture-dependent
2601     // length.
2602     ShuffleCost +=
2603         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2604                                                  std::nullopt, CostKind, 0, Ty);
2605     IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2606     MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2607     // The last min/max should be in vector registers and we counted it above.
2608     // So we just need a single extractelement.
2609     return ShuffleCost + MinMaxCost +
2610            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2611                                        CostKind, 0, nullptr, nullptr);
2612   }
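  // Illustrative note for getMinMaxReductionCost above: the structure mirrors
  // getTreeReductionCost, but each level is charged as a min/max intrinsic on
  // the progressively narrower vector type. E.g. for an smin reduction of
  // <8 x i32> on a hypothetical target with legal <4 x i32>: one
  // extract-subvector shuffle plus smin on <4 x i32>, two permute-plus-smin
  // levels on <4 x i32>, and one final extractelement.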
2613 
2614   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2615                                            Type *ResTy, VectorType *Ty,
2616                                            FastMathFlags FMF,
2617                                            TTI::TargetCostKind CostKind) {
2618     // Without any native support, this is equivalent to the cost of
2619     // vecreduce.opcode(ext(Ty A)).
2620     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2621     InstructionCost RedCost =
2622         thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2623     InstructionCost ExtCost = thisT()->getCastInstrCost(
2624         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2625         TTI::CastContextHint::None, CostKind);
2626 
2627     return RedCost + ExtCost;
2628   }
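  // Illustrative decomposition for getExtendedReductionCost above: e.g. a
  // sign-extended add reduction from <8 x i8> to i32 is costed as
  //   %e = sext <8 x i8> %a to <8 x i32>
  //   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
  // i.e. one extend plus one i32 add reduction.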
2629 
2630   InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2631                                          VectorType *Ty,
2632                                          TTI::TargetCostKind CostKind) {
2633     // Without any native support, this is equivalent to the cost of
2634     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2635     // vecreduce.add(mul(A, B)).
2636     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2637     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2638         Instruction::Add, ExtTy, std::nullopt, CostKind);
2639     InstructionCost ExtCost = thisT()->getCastInstrCost(
2640         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2641         TTI::CastContextHint::None, CostKind);
2642 
2643     InstructionCost MulCost =
2644         thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2645 
2646     return RedCost + MulCost + 2 * ExtCost;
2647   }
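  // Illustrative decomposition for getMulAccReductionCost above: e.g. an
  // unsigned multiply-accumulate reduction from <8 x i8> to i32 is costed as
  //   %ea = zext <8 x i8> %a to <8 x i32>
  //   %eb = zext <8 x i8> %b to <8 x i32>
  //   %m  = mul <8 x i32> %ea, %eb
  //   %r  = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  // i.e. two extends, one widened multiply and one add reduction.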
2648 
2649   InstructionCost getVectorSplitCost() { return 1; }
2650 
2651   /// @}
2652 };
2653 
2654 /// Concrete BasicTTIImpl that can be used if no further customization
2655 /// is needed.
2656 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2657   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2658 
2659   friend class BasicTTIImplBase<BasicTTIImpl>;
2660 
2661   const TargetSubtargetInfo *ST;
2662   const TargetLoweringBase *TLI;
2663 
2664   const TargetSubtargetInfo *getST() const { return ST; }
2665   const TargetLoweringBase *getTLI() const { return TLI; }
2666 
2667 public:
2668   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2669 };
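// Hypothetical usage sketch (editorial, not part of this header): a target
// specializes the CRTP base the same way BasicTTIImpl does above, by
// providing its own getST()/getTLI() accessors, e.g.:
//   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
//     friend class BasicTTIImplBase<MyTargetTTIImpl>;
//     const TargetSubtargetInfo *ST;
//     const TargetLoweringBase *TLI;
//     const TargetSubtargetInfo *getST() const { return ST; }
//     const TargetLoweringBase *getTLI() const { return TLI; }
//   public:
//     explicit MyTargetTTIImpl(const TargetMachine *TM, const Function &F);
//   };
// MyTargetTTIImpl is a placeholder name; real targets typically return their
// own subtarget/lowering subclasses from getST()/getTLI().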
2670 
2671 } // end namespace llvm
2672 
2673 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2674