xref: /freebsd/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (revision a4e5e0106ac7145f56eb39a691e302cabb4635be)
1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "RISCVTargetTransformInfo.h"
10 #include "MCTargetDesc/RISCVMatInt.h"
11 #include "llvm/ADT/STLExtras.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/CodeGen/BasicTTIImpl.h"
14 #include "llvm/CodeGen/CostTable.h"
15 #include "llvm/CodeGen/TargetLowering.h"
16 #include "llvm/IR/Instructions.h"
17 #include <cmath>
18 #include <optional>
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "riscvtti"
22 
23 static cl::opt<unsigned> RVVRegisterWidthLMUL(
24     "riscv-v-register-bit-width-lmul",
25     cl::desc(
26         "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27         "by autovectorized code. Fractional LMULs are not supported."),
28     cl::init(2), cl::Hidden);
29 
30 static cl::opt<unsigned> SLPMaxVF(
31     "riscv-v-slp-max-vf",
32     cl::desc(
33         "Overrides result used for getMaximumVF query which is used "
34         "exclusively by SLP vectorizer."),
35     cl::Hidden);
36 
37 InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
38   // TODO: We assume the reciprocal throughput is 1 for LMUL_1; it is
39   // implementation-defined.
40   if (!VT.isVector())
41     return InstructionCost::getInvalid();
42   unsigned DLenFactor = ST->getDLenFactor();
43   unsigned Cost;
44   if (VT.isScalableVector()) {
45     unsigned LMul;
46     bool Fractional;
47     std::tie(LMul, Fractional) =
48         RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
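    // Fractional LMULs reduce the cost relative to LMUL_1 (but never below 1);
    // integral LMULs scale it up as LMul * DLenFactor.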
49     if (Fractional)
50       Cost = LMul <= DLenFactor ? (DLenFactor / LMul) : 1;
51     else
52       Cost = (LMul * DLenFactor);
53   } else {
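    // For fixed-length vectors, count how many DLEN-sized chunks are needed
    // to hold the type.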
54     Cost = divideCeil(VT.getSizeInBits(), ST->getRealMinVLen() / DLenFactor);
55   }
56   return Cost;
57 }
58 
59 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
60                                             TTI::TargetCostKind CostKind) {
61   assert(Ty->isIntegerTy() &&
62          "getIntImmCost can only estimate cost of materialising integers");
63 
64   // We have a Zero register, so 0 is always free.
65   if (Imm == 0)
66     return TTI::TCC_Free;
67 
68   // Otherwise, we check how many instructions it will take to materialise.
69   const DataLayout &DL = getDataLayout();
70   return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
71                                     getST()->getFeatureBits());
72 }
73 
74 // Look for patterns of shift followed by AND that can be turned into a pair of
75 // shifts. We won't need to materialize an immediate for the AND so these can
76 // be considered free.
77 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
78   uint64_t Mask = Imm.getZExtValue();
79   auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
80   if (!BO || !BO->hasOneUse())
81     return false;
82 
83   if (BO->getOpcode() != Instruction::Shl)
84     return false;
85 
86   if (!isa<ConstantInt>(BO->getOperand(1)))
87     return false;
88 
89   unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
90   // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
91   // is a mask shifted by c2 bits with c3 leading zeros.
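  // For example, on RV64, (and (shl x, 4), 0xff0) can become
  // (srli (slli x, 56), 52).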
92   if (isShiftedMask_64(Mask)) {
93     unsigned Trailing = llvm::countr_zero(Mask);
94     if (ShAmt == Trailing)
95       return true;
96   }
97 
98   return false;
99 }
100 
101 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
102                                                 const APInt &Imm, Type *Ty,
103                                                 TTI::TargetCostKind CostKind,
104                                                 Instruction *Inst) {
105   assert(Ty->isIntegerTy() &&
106          "getIntImmCost can only estimate cost of materialising integers");
107 
108   // We have a Zero register, so 0 is always free.
109   if (Imm == 0)
110     return TTI::TCC_Free;
111 
112   // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
113   // commutative; in others the immediate comes from a specific argument index.
114   bool Takes12BitImm = false;
115   unsigned ImmArgIdx = ~0U;
116 
117   switch (Opcode) {
118   case Instruction::GetElementPtr:
119     // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
120     // split up large offsets in GEP into better parts than ConstantHoisting
121     // can.
122     return TTI::TCC_Free;
123   case Instruction::And:
124     // zext.h
125     if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
126       return TTI::TCC_Free;
127     // zext.w
128     if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
129       return TTI::TCC_Free;
130     // bclri
131     if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
132       return TTI::TCC_Free;
133     if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
134         canUseShiftPair(Inst, Imm))
135       return TTI::TCC_Free;
136     Takes12BitImm = true;
137     break;
138   case Instruction::Add:
139     Takes12BitImm = true;
140     break;
141   case Instruction::Or:
142   case Instruction::Xor:
143     // bseti/binvi
144     if (ST->hasStdExtZbs() && Imm.isPowerOf2())
145       return TTI::TCC_Free;
146     Takes12BitImm = true;
147     break;
148   case Instruction::Mul:
149     // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
150     if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
151       return TTI::TCC_Free;
152     // FIXME: There is no MULI instruction.
153     Takes12BitImm = true;
154     break;
155   case Instruction::Sub:
156   case Instruction::Shl:
157   case Instruction::LShr:
158   case Instruction::AShr:
159     Takes12BitImm = true;
160     ImmArgIdx = 1;
161     break;
162   default:
163     break;
164   }
165 
166   if (Takes12BitImm) {
167     // Check immediate is the correct argument...
168     if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
169       // ... and fits into the 12-bit immediate.
170       if (Imm.getSignificantBits() <= 64 &&
171           getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
172         return TTI::TCC_Free;
173       }
174     }
175 
176     // Otherwise, use the full materialisation cost.
177     return getIntImmCost(Imm, Ty, CostKind);
178   }
179 
180   // By default, prevent hoisting.
181   return TTI::TCC_Free;
182 }
183 
184 InstructionCost
185 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
186                                   const APInt &Imm, Type *Ty,
187                                   TTI::TargetCostKind CostKind) {
188   // Prevent hoisting in unknown cases.
189   return TTI::TCC_Free;
190 }
191 
192 TargetTransformInfo::PopcntSupportKind
193 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
194   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
195   return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
196 }
197 
198 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
199   // Currently, the ExpandReductions pass can't expand scalable-vector
200   // reductions, but we still request expansion as RVV doesn't support certain
201   // reductions and the SelectionDAG can't legalize them either.
202   switch (II->getIntrinsicID()) {
203   default:
204     return false;
205   // These reductions have no equivalent in RVV
206   case Intrinsic::vector_reduce_mul:
207   case Intrinsic::vector_reduce_fmul:
208     return true;
209   }
210 }
211 
212 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
213   if (ST->hasVInstructions())
214     return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
215   return BaseT::getMaxVScale();
216 }
217 
218 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
219   if (ST->hasVInstructions())
220     if (unsigned MinVLen = ST->getRealMinVLen();
221         MinVLen >= RISCV::RVVBitsPerBlock)
222       return MinVLen / RISCV::RVVBitsPerBlock;
223   return BaseT::getVScaleForTuning();
224 }
225 
226 TypeSize
227 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
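  // Round the requested LMUL down to a power of two in [1, 8]; fractional
  // LMULs are not supported here.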
228   unsigned LMUL =
229       llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
230   switch (K) {
231   case TargetTransformInfo::RGK_Scalar:
232     return TypeSize::getFixed(ST->getXLen());
233   case TargetTransformInfo::RGK_FixedWidthVector:
234     return TypeSize::getFixed(
235         ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
236   case TargetTransformInfo::RGK_ScalableVector:
237     return TypeSize::getScalable(
238         (ST->hasVInstructions() &&
239          ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
240             ? LMUL * RISCV::RVVBitsPerBlock
241             : 0);
242   }
243 
244   llvm_unreachable("Unsupported register kind");
245 }
246 
247 InstructionCost
248 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
249   // Add the cost of address generation plus the cost of the load. The address
250   // is expected to be a PC-relative offset to a constant pool entry
251   // formed using auipc/addi.
252   return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
253                              /*AddressSpace=*/0, CostKind);
254 }
255 
256 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
257                                         LLVMContext &C) {
258   assert((DataVT.getScalarSizeInBits() != 8 ||
259           DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
260   MVT IndexVT = DataVT.changeTypeToInteger();
261   if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
262     IndexVT = IndexVT.changeVectorElementType(MVT::i16);
263   return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
264 }
265 
266 /// Return the cost of a vrgather.vv instruction for the type VT.  vrgather.vv
267 /// is generally quadratic in the number of vregs implied by LMUL.  Note that
268 /// the operands (index and possibly mask) are handled separately.
269 InstructionCost RISCVTTIImpl::getVRGatherVVCost(MVT VT) {
270   return getLMULCost(VT) * getLMULCost(VT);
271 }
272 
273 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
274                                              VectorType *Tp, ArrayRef<int> Mask,
275                                              TTI::TargetCostKind CostKind,
276                                              int Index, VectorType *SubTp,
277                                              ArrayRef<const Value *> Args) {
278   Kind = improveShuffleKindFromMask(Kind, Mask);
279 
280   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
281 
282   // First, handle cases where having a fixed length vector enables us to
283   // give a more accurate cost than falling back to generic scalable codegen.
284   // TODO: Each of these cases hints at a modeling gap around scalable vectors.
285   if (isa<FixedVectorType>(Tp)) {
286     switch (Kind) {
287     default:
288       break;
289     case TTI::SK_PermuteSingleSrc: {
290       if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
291         MVT EltTp = LT.second.getVectorElementType();
292         // If the element size is < ELEN, then interleave and deinterleave
293         // shuffles of 2 vectors can be lowered into the following
294         // sequences.
295         if (EltTp.getScalarSizeInBits() < ST->getELEN()) {
296           // Example sequence:
297           //   vsetivli     zero, 4, e8, mf4, ta, ma (ignored)
298           //   vwaddu.vv    v10, v8, v9
299           //   li       a0, -1                   (ignored)
300           //   vwmaccu.vx   v10, a0, v9
301           if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
302             return 2 * LT.first * getLMULCost(LT.second);
303 
304           if (Mask[0] == 0 || Mask[0] == 1) {
305             auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
306             // Example sequence:
307             //   vnsrl.wi   v10, v8, 0
308             if (equal(DeinterleaveMask, Mask))
309               return LT.first * getLMULCost(LT.second);
310           }
311         }
312 
313         // vrgather + cost of generating the mask constant.
314         // We model this for an unknown mask with a single vrgather.
315         if (LT.first == 1 &&
316             (LT.second.getScalarSizeInBits() != 8 ||
317              LT.second.getVectorNumElements() <= 256)) {
318           VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
319           InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
320           return IndexCost + getVRGatherVVCost(LT.second);
321         }
322       }
323       break;
324     }
325     case TTI::SK_Transpose:
326     case TTI::SK_PermuteTwoSrc: {
327       if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
328         // 2 x (vrgather + cost of generating the mask constant) + cost of mask
329         // register for the second vrgather. We model this for an unknown
330         // (shuffle) mask.
331         if (LT.first == 1 &&
332             (LT.second.getScalarSizeInBits() != 8 ||
333              LT.second.getVectorNumElements() <= 256)) {
334           auto &C = Tp->getContext();
335           auto EC = Tp->getElementCount();
336           VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
337           VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
338           InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
339           InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
340           return 2 * IndexCost + 2 * getVRGatherVVCost(LT.second) + MaskCost;
341         }
342       }
343       break;
344     }
345     }
346   }
347 
348   // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
349   switch (Kind) {
350   default:
351     // Fallthrough to generic handling.
352     // TODO: Most of these cases will return getInvalid in generic code, and
353     // must be implemented here.
354     break;
355   case TTI::SK_ExtractSubvector:
356     // Example sequence:
357     // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
358     // vslidedown.vi  v8, v9, 2
359     return LT.first * getLMULCost(LT.second);
360   case TTI::SK_InsertSubvector:
361     // Example sequence:
362     // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
363     // vslideup.vi  v8, v9, 2
364     return LT.first * getLMULCost(LT.second);
365   case TTI::SK_Select: {
366     // Example sequence:
367     // li           a0, 90
368     // vsetivli     zero, 8, e8, mf2, ta, ma (ignored)
369     // vmv.s.x      v0, a0
370     // vmerge.vvm   v8, v9, v8, v0
371     return LT.first * 3 * getLMULCost(LT.second);
372   }
373   case TTI::SK_Broadcast: {
374     bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
375                                            Instruction::InsertElement);
376     if (LT.second.getScalarSizeInBits() == 1) {
377       if (HasScalar) {
378         // Example sequence:
379         //   andi a0, a0, 1
380         //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)
381         //   vmv.v.x v8, a0
382         //   vmsne.vi v0, v8, 0
383         return LT.first * getLMULCost(LT.second) * 3;
384       }
385       // Example sequence:
386       //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored)
387       //   vmv.v.i v8, 0
388       //   vmerge.vim      v8, v8, 1, v0
389       //   vmv.x.s a0, v8
390       //   andi    a0, a0, 1
391       //   vmv.v.x v8, a0
392       //   vmsne.vi  v0, v8, 0
393 
394       return LT.first * getLMULCost(LT.second) * 6;
395     }
396 
397     if (HasScalar) {
398       // Example sequence:
399       //   vmv.v.x v8, a0
400       return LT.first * getLMULCost(LT.second);
401     }
402 
403     // Example sequence:
404     //   vrgather.vi     v9, v8, 0
405     // TODO: vrgather could be slower than vmv.v.x. It is
406     // implementation-dependent.
407     return LT.first * getLMULCost(LT.second);
408   }
409   case TTI::SK_Splice:
410     // vslidedown+vslideup.
411     // TODO: Multiplying by LT.first implies this legalizes into multiple copies
412     // of similar code, but I think we expand through memory.
413     return 2 * LT.first * getLMULCost(LT.second);
414   case TTI::SK_Reverse: {
415     // TODO: Cases to improve here:
416     // * Illegal vector types
417     // * i64 on RV32
418     // * i1 vector
419     // At low LMUL, most of the cost is producing the vrgather index register.
420     // At high LMUL, the cost of the vrgather itself will dominate.
421     // Example sequence:
422     //   csrr a0, vlenb
423     //   srli a0, a0, 3
424     //   addi a0, a0, -1
425     //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
426     //   vid.v v9
427     //   vrsub.vx v10, v9, a0
428     //   vrgather.vv v9, v8, v10
429     InstructionCost LenCost = 3;
430     if (LT.second.isFixedLengthVector())
431       // vrsub.vi has a 5-bit immediate field, otherwise an li suffices.
432       LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
433     InstructionCost GatherCost = 2 + getVRGatherVVCost(LT.second);
434     // Mask operations additionally require an extend and a truncate.
435     InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
436     return LT.first * (LenCost + GatherCost + ExtendCost);
437   }
438   }
439   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
440 }
441 
442 InstructionCost
443 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
444                                     unsigned AddressSpace,
445                                     TTI::TargetCostKind CostKind) {
446   if (!isLegalMaskedLoadStore(Src, Alignment) ||
447       CostKind != TTI::TCK_RecipThroughput)
448     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
449                                         CostKind);
450 
451   return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
452 }
453 
454 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
455     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
456     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
457     bool UseMaskForCond, bool UseMaskForGaps) {
458   if (isa<ScalableVectorType>(VecTy))
459     return InstructionCost::getInvalid();
460   auto *FVTy = cast<FixedVectorType>(VecTy);
461   InstructionCost MemCost =
462       getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
463   unsigned VF = FVTy->getNumElements() / Factor;
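  // VF is the number of lanes in each interleaved member vector.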
464 
465   // The interleaved memory access pass will lower interleaved memory ops (i.e.
466   // a load or store combined with a specific shuffle) to vlseg/vsseg
467   // intrinsics. In those cases we can treat it as if it's just one (legal)
468   // memory op.
469   if (!UseMaskForCond && !UseMaskForGaps &&
470       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
471     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
472     // Need to make sure the type hasn't been scalarized.
473     if (LT.second.isFixedLengthVector()) {
474       auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
475                                              LT.second.getVectorNumElements());
476       // FIXME: We use the memory op cost of the *legalized* type here because
477       // getMemoryOpCost returns a really expensive cost for types like
478       // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
479       // Should the memory op cost of these be cheaper?
480       if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
481                                             AddressSpace, DL)) {
482         InstructionCost LegalMemCost = getMemoryOpCost(
483             Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
484         return LT.first + LegalMemCost;
485       }
486     }
487   }
488 
489   // An interleaved load will look like this for Factor=3:
490   // %wide.vec = load <12 x i32>, ptr %3, align 4
491   // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
492   // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
493   // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
494   if (Opcode == Instruction::Load) {
495     InstructionCost Cost = MemCost;
496     for (unsigned Index : Indices) {
497       FixedVectorType *SubVecTy =
498           FixedVectorType::get(FVTy->getElementType(), VF);
499       auto Mask = createStrideMask(Index, Factor, VF);
500       InstructionCost ShuffleCost =
501           getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
502                          CostKind, 0, nullptr, {});
503       Cost += ShuffleCost;
504     }
505     return Cost;
506   }
507 
508   // TODO: Model for NF > 2
509   // We'll need to enhance getShuffleCost to model shuffles that are just
510   // inserts and extracts into subvectors, since they won't have the full cost
511   // of a vrgather.
512   // An interleaved store for 3 vectors of 4 lanes will look like
513   // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
514   // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
515   // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
516   // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
517   // store <12 x i32> %interleaved.vec, ptr %10, align 4
518   if (Factor != 2)
519     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
520                                              Alignment, AddressSpace, CostKind,
521                                              UseMaskForCond, UseMaskForGaps);
522 
523   assert(Opcode == Instruction::Store && "Opcode must be a store");
524   // For an interleaving store of 2 vectors, we perform one large interleaving
525   // shuffle that goes into the wide store
526   auto Mask = createInterleaveMask(VF, Factor);
527   InstructionCost ShuffleCost =
528       getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
529                      CostKind, 0, nullptr, {});
530   return MemCost + ShuffleCost;
531 }
532 
533 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
534     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
535     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
536   if (CostKind != TTI::TCK_RecipThroughput)
537     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
538                                          Alignment, CostKind, I);
539 
540   if ((Opcode == Instruction::Load &&
541        !isLegalMaskedGather(DataTy, Align(Alignment))) ||
542       (Opcode == Instruction::Store &&
543        !isLegalMaskedScatter(DataTy, Align(Alignment))))
544     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
545                                          Alignment, CostKind, I);
546 
547   // Cost is proportional to the number of memory operations implied.  For
548   // scalable vectors, we use an estimate on that number since we don't
549   // know exactly what VL will be.
550   auto &VTy = *cast<VectorType>(DataTy);
551   InstructionCost MemOpCost =
552       getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
553                       {TTI::OK_AnyValue, TTI::OP_None}, I);
554   unsigned NumLoads = getEstimatedVLFor(&VTy);
555   return NumLoads * MemOpCost;
556 }
557 
558 // Currently, these represent both throughput and codesize costs
559 // for the respective intrinsics.  The costs in this table are simply
560 // instruction counts with the following adjustments made:
561 // * One vsetvli is considered free.
562 static const CostTblEntry VectorIntrinsicCostTable[]{
563     {Intrinsic::floor, MVT::v2f32, 9},
564     {Intrinsic::floor, MVT::v4f32, 9},
565     {Intrinsic::floor, MVT::v8f32, 9},
566     {Intrinsic::floor, MVT::v16f32, 9},
567     {Intrinsic::floor, MVT::nxv1f32, 9},
568     {Intrinsic::floor, MVT::nxv2f32, 9},
569     {Intrinsic::floor, MVT::nxv4f32, 9},
570     {Intrinsic::floor, MVT::nxv8f32, 9},
571     {Intrinsic::floor, MVT::nxv16f32, 9},
572     {Intrinsic::floor, MVT::v2f64, 9},
573     {Intrinsic::floor, MVT::v4f64, 9},
574     {Intrinsic::floor, MVT::v8f64, 9},
575     {Intrinsic::floor, MVT::v16f64, 9},
576     {Intrinsic::floor, MVT::nxv1f64, 9},
577     {Intrinsic::floor, MVT::nxv2f64, 9},
578     {Intrinsic::floor, MVT::nxv4f64, 9},
579     {Intrinsic::floor, MVT::nxv8f64, 9},
580     {Intrinsic::ceil, MVT::v2f32, 9},
581     {Intrinsic::ceil, MVT::v4f32, 9},
582     {Intrinsic::ceil, MVT::v8f32, 9},
583     {Intrinsic::ceil, MVT::v16f32, 9},
584     {Intrinsic::ceil, MVT::nxv1f32, 9},
585     {Intrinsic::ceil, MVT::nxv2f32, 9},
586     {Intrinsic::ceil, MVT::nxv4f32, 9},
587     {Intrinsic::ceil, MVT::nxv8f32, 9},
588     {Intrinsic::ceil, MVT::nxv16f32, 9},
589     {Intrinsic::ceil, MVT::v2f64, 9},
590     {Intrinsic::ceil, MVT::v4f64, 9},
591     {Intrinsic::ceil, MVT::v8f64, 9},
592     {Intrinsic::ceil, MVT::v16f64, 9},
593     {Intrinsic::ceil, MVT::nxv1f64, 9},
594     {Intrinsic::ceil, MVT::nxv2f64, 9},
595     {Intrinsic::ceil, MVT::nxv4f64, 9},
596     {Intrinsic::ceil, MVT::nxv8f64, 9},
597     {Intrinsic::trunc, MVT::v2f32, 7},
598     {Intrinsic::trunc, MVT::v4f32, 7},
599     {Intrinsic::trunc, MVT::v8f32, 7},
600     {Intrinsic::trunc, MVT::v16f32, 7},
601     {Intrinsic::trunc, MVT::nxv1f32, 7},
602     {Intrinsic::trunc, MVT::nxv2f32, 7},
603     {Intrinsic::trunc, MVT::nxv4f32, 7},
604     {Intrinsic::trunc, MVT::nxv8f32, 7},
605     {Intrinsic::trunc, MVT::nxv16f32, 7},
606     {Intrinsic::trunc, MVT::v2f64, 7},
607     {Intrinsic::trunc, MVT::v4f64, 7},
608     {Intrinsic::trunc, MVT::v8f64, 7},
609     {Intrinsic::trunc, MVT::v16f64, 7},
610     {Intrinsic::trunc, MVT::nxv1f64, 7},
611     {Intrinsic::trunc, MVT::nxv2f64, 7},
612     {Intrinsic::trunc, MVT::nxv4f64, 7},
613     {Intrinsic::trunc, MVT::nxv8f64, 7},
614     {Intrinsic::round, MVT::v2f32, 9},
615     {Intrinsic::round, MVT::v4f32, 9},
616     {Intrinsic::round, MVT::v8f32, 9},
617     {Intrinsic::round, MVT::v16f32, 9},
618     {Intrinsic::round, MVT::nxv1f32, 9},
619     {Intrinsic::round, MVT::nxv2f32, 9},
620     {Intrinsic::round, MVT::nxv4f32, 9},
621     {Intrinsic::round, MVT::nxv8f32, 9},
622     {Intrinsic::round, MVT::nxv16f32, 9},
623     {Intrinsic::round, MVT::v2f64, 9},
624     {Intrinsic::round, MVT::v4f64, 9},
625     {Intrinsic::round, MVT::v8f64, 9},
626     {Intrinsic::round, MVT::v16f64, 9},
627     {Intrinsic::round, MVT::nxv1f64, 9},
628     {Intrinsic::round, MVT::nxv2f64, 9},
629     {Intrinsic::round, MVT::nxv4f64, 9},
630     {Intrinsic::round, MVT::nxv8f64, 9},
631     {Intrinsic::roundeven, MVT::v2f32, 9},
632     {Intrinsic::roundeven, MVT::v4f32, 9},
633     {Intrinsic::roundeven, MVT::v8f32, 9},
634     {Intrinsic::roundeven, MVT::v16f32, 9},
635     {Intrinsic::roundeven, MVT::nxv1f32, 9},
636     {Intrinsic::roundeven, MVT::nxv2f32, 9},
637     {Intrinsic::roundeven, MVT::nxv4f32, 9},
638     {Intrinsic::roundeven, MVT::nxv8f32, 9},
639     {Intrinsic::roundeven, MVT::nxv16f32, 9},
640     {Intrinsic::roundeven, MVT::v2f64, 9},
641     {Intrinsic::roundeven, MVT::v4f64, 9},
642     {Intrinsic::roundeven, MVT::v8f64, 9},
643     {Intrinsic::roundeven, MVT::v16f64, 9},
644     {Intrinsic::roundeven, MVT::nxv1f64, 9},
645     {Intrinsic::roundeven, MVT::nxv2f64, 9},
646     {Intrinsic::roundeven, MVT::nxv4f64, 9},
647     {Intrinsic::roundeven, MVT::nxv8f64, 9},
648     {Intrinsic::rint, MVT::v2f32, 7},
649     {Intrinsic::rint, MVT::v4f32, 7},
650     {Intrinsic::rint, MVT::v8f32, 7},
651     {Intrinsic::rint, MVT::v16f32, 7},
652     {Intrinsic::rint, MVT::nxv1f32, 7},
653     {Intrinsic::rint, MVT::nxv2f32, 7},
654     {Intrinsic::rint, MVT::nxv4f32, 7},
655     {Intrinsic::rint, MVT::nxv8f32, 7},
656     {Intrinsic::rint, MVT::nxv16f32, 7},
657     {Intrinsic::rint, MVT::v2f64, 7},
658     {Intrinsic::rint, MVT::v4f64, 7},
659     {Intrinsic::rint, MVT::v8f64, 7},
660     {Intrinsic::rint, MVT::v16f64, 7},
661     {Intrinsic::rint, MVT::nxv1f64, 7},
662     {Intrinsic::rint, MVT::nxv2f64, 7},
663     {Intrinsic::rint, MVT::nxv4f64, 7},
664     {Intrinsic::rint, MVT::nxv8f64, 7},
665     {Intrinsic::nearbyint, MVT::v2f32, 9},
666     {Intrinsic::nearbyint, MVT::v4f32, 9},
667     {Intrinsic::nearbyint, MVT::v8f32, 9},
668     {Intrinsic::nearbyint, MVT::v16f32, 9},
669     {Intrinsic::nearbyint, MVT::nxv1f32, 9},
670     {Intrinsic::nearbyint, MVT::nxv2f32, 9},
671     {Intrinsic::nearbyint, MVT::nxv4f32, 9},
672     {Intrinsic::nearbyint, MVT::nxv8f32, 9},
673     {Intrinsic::nearbyint, MVT::nxv16f32, 9},
674     {Intrinsic::nearbyint, MVT::v2f64, 9},
675     {Intrinsic::nearbyint, MVT::v4f64, 9},
676     {Intrinsic::nearbyint, MVT::v8f64, 9},
677     {Intrinsic::nearbyint, MVT::v16f64, 9},
678     {Intrinsic::nearbyint, MVT::nxv1f64, 9},
679     {Intrinsic::nearbyint, MVT::nxv2f64, 9},
680     {Intrinsic::nearbyint, MVT::nxv4f64, 9},
681     {Intrinsic::nearbyint, MVT::nxv8f64, 9},
682     {Intrinsic::bswap, MVT::v2i16, 3},
683     {Intrinsic::bswap, MVT::v4i16, 3},
684     {Intrinsic::bswap, MVT::v8i16, 3},
685     {Intrinsic::bswap, MVT::v16i16, 3},
686     {Intrinsic::bswap, MVT::nxv1i16, 3},
687     {Intrinsic::bswap, MVT::nxv2i16, 3},
688     {Intrinsic::bswap, MVT::nxv4i16, 3},
689     {Intrinsic::bswap, MVT::nxv8i16, 3},
690     {Intrinsic::bswap, MVT::nxv16i16, 3},
691     {Intrinsic::bswap, MVT::v2i32, 12},
692     {Intrinsic::bswap, MVT::v4i32, 12},
693     {Intrinsic::bswap, MVT::v8i32, 12},
694     {Intrinsic::bswap, MVT::v16i32, 12},
695     {Intrinsic::bswap, MVT::nxv1i32, 12},
696     {Intrinsic::bswap, MVT::nxv2i32, 12},
697     {Intrinsic::bswap, MVT::nxv4i32, 12},
698     {Intrinsic::bswap, MVT::nxv8i32, 12},
699     {Intrinsic::bswap, MVT::nxv16i32, 12},
700     {Intrinsic::bswap, MVT::v2i64, 31},
701     {Intrinsic::bswap, MVT::v4i64, 31},
702     {Intrinsic::bswap, MVT::v8i64, 31},
703     {Intrinsic::bswap, MVT::v16i64, 31},
704     {Intrinsic::bswap, MVT::nxv1i64, 31},
705     {Intrinsic::bswap, MVT::nxv2i64, 31},
706     {Intrinsic::bswap, MVT::nxv4i64, 31},
707     {Intrinsic::bswap, MVT::nxv8i64, 31},
708     {Intrinsic::vp_bswap, MVT::v2i16, 3},
709     {Intrinsic::vp_bswap, MVT::v4i16, 3},
710     {Intrinsic::vp_bswap, MVT::v8i16, 3},
711     {Intrinsic::vp_bswap, MVT::v16i16, 3},
712     {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
713     {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
714     {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
715     {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
716     {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
717     {Intrinsic::vp_bswap, MVT::v2i32, 12},
718     {Intrinsic::vp_bswap, MVT::v4i32, 12},
719     {Intrinsic::vp_bswap, MVT::v8i32, 12},
720     {Intrinsic::vp_bswap, MVT::v16i32, 12},
721     {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
722     {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
723     {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
724     {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
725     {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
726     {Intrinsic::vp_bswap, MVT::v2i64, 31},
727     {Intrinsic::vp_bswap, MVT::v4i64, 31},
728     {Intrinsic::vp_bswap, MVT::v8i64, 31},
729     {Intrinsic::vp_bswap, MVT::v16i64, 31},
730     {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
731     {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
732     {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
733     {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
734     {Intrinsic::vp_fshl, MVT::v2i8, 7},
735     {Intrinsic::vp_fshl, MVT::v4i8, 7},
736     {Intrinsic::vp_fshl, MVT::v8i8, 7},
737     {Intrinsic::vp_fshl, MVT::v16i8, 7},
738     {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
739     {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
740     {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
741     {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
742     {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
743     {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
744     {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
745     {Intrinsic::vp_fshl, MVT::v2i16, 7},
746     {Intrinsic::vp_fshl, MVT::v4i16, 7},
747     {Intrinsic::vp_fshl, MVT::v8i16, 7},
748     {Intrinsic::vp_fshl, MVT::v16i16, 7},
749     {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
750     {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
751     {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
752     {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
753     {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
754     {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
755     {Intrinsic::vp_fshl, MVT::v2i32, 7},
756     {Intrinsic::vp_fshl, MVT::v4i32, 7},
757     {Intrinsic::vp_fshl, MVT::v8i32, 7},
758     {Intrinsic::vp_fshl, MVT::v16i32, 7},
759     {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
760     {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
761     {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
762     {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
763     {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
764     {Intrinsic::vp_fshl, MVT::v2i64, 7},
765     {Intrinsic::vp_fshl, MVT::v4i64, 7},
766     {Intrinsic::vp_fshl, MVT::v8i64, 7},
767     {Intrinsic::vp_fshl, MVT::v16i64, 7},
768     {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
769     {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
770     {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
771     {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
772     {Intrinsic::vp_fshr, MVT::v2i8, 7},
773     {Intrinsic::vp_fshr, MVT::v4i8, 7},
774     {Intrinsic::vp_fshr, MVT::v8i8, 7},
775     {Intrinsic::vp_fshr, MVT::v16i8, 7},
776     {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
777     {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
778     {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
779     {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
780     {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
781     {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
782     {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
783     {Intrinsic::vp_fshr, MVT::v2i16, 7},
784     {Intrinsic::vp_fshr, MVT::v4i16, 7},
785     {Intrinsic::vp_fshr, MVT::v8i16, 7},
786     {Intrinsic::vp_fshr, MVT::v16i16, 7},
787     {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
788     {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
789     {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
790     {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
791     {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
792     {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
793     {Intrinsic::vp_fshr, MVT::v2i32, 7},
794     {Intrinsic::vp_fshr, MVT::v4i32, 7},
795     {Intrinsic::vp_fshr, MVT::v8i32, 7},
796     {Intrinsic::vp_fshr, MVT::v16i32, 7},
797     {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
798     {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
799     {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
800     {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
801     {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
802     {Intrinsic::vp_fshr, MVT::v2i64, 7},
803     {Intrinsic::vp_fshr, MVT::v4i64, 7},
804     {Intrinsic::vp_fshr, MVT::v8i64, 7},
805     {Intrinsic::vp_fshr, MVT::v16i64, 7},
806     {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
807     {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
808     {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
809     {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
810     {Intrinsic::bitreverse, MVT::v2i8, 17},
811     {Intrinsic::bitreverse, MVT::v4i8, 17},
812     {Intrinsic::bitreverse, MVT::v8i8, 17},
813     {Intrinsic::bitreverse, MVT::v16i8, 17},
814     {Intrinsic::bitreverse, MVT::nxv1i8, 17},
815     {Intrinsic::bitreverse, MVT::nxv2i8, 17},
816     {Intrinsic::bitreverse, MVT::nxv4i8, 17},
817     {Intrinsic::bitreverse, MVT::nxv8i8, 17},
818     {Intrinsic::bitreverse, MVT::nxv16i8, 17},
819     {Intrinsic::bitreverse, MVT::v2i16, 24},
820     {Intrinsic::bitreverse, MVT::v4i16, 24},
821     {Intrinsic::bitreverse, MVT::v8i16, 24},
822     {Intrinsic::bitreverse, MVT::v16i16, 24},
823     {Intrinsic::bitreverse, MVT::nxv1i16, 24},
824     {Intrinsic::bitreverse, MVT::nxv2i16, 24},
825     {Intrinsic::bitreverse, MVT::nxv4i16, 24},
826     {Intrinsic::bitreverse, MVT::nxv8i16, 24},
827     {Intrinsic::bitreverse, MVT::nxv16i16, 24},
828     {Intrinsic::bitreverse, MVT::v2i32, 33},
829     {Intrinsic::bitreverse, MVT::v4i32, 33},
830     {Intrinsic::bitreverse, MVT::v8i32, 33},
831     {Intrinsic::bitreverse, MVT::v16i32, 33},
832     {Intrinsic::bitreverse, MVT::nxv1i32, 33},
833     {Intrinsic::bitreverse, MVT::nxv2i32, 33},
834     {Intrinsic::bitreverse, MVT::nxv4i32, 33},
835     {Intrinsic::bitreverse, MVT::nxv8i32, 33},
836     {Intrinsic::bitreverse, MVT::nxv16i32, 33},
837     {Intrinsic::bitreverse, MVT::v2i64, 52},
838     {Intrinsic::bitreverse, MVT::v4i64, 52},
839     {Intrinsic::bitreverse, MVT::v8i64, 52},
840     {Intrinsic::bitreverse, MVT::v16i64, 52},
841     {Intrinsic::bitreverse, MVT::nxv1i64, 52},
842     {Intrinsic::bitreverse, MVT::nxv2i64, 52},
843     {Intrinsic::bitreverse, MVT::nxv4i64, 52},
844     {Intrinsic::bitreverse, MVT::nxv8i64, 52},
845     {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
846     {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
847     {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
848     {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
849     {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
850     {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
851     {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
852     {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
853     {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
854     {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
855     {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
856     {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
857     {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
858     {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
859     {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
860     {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
861     {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
862     {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
863     {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
864     {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
865     {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
866     {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
867     {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
868     {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
869     {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
870     {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
871     {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
872     {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
873     {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
874     {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
875     {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
876     {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
877     {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
878     {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
879     {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
880     {Intrinsic::ctpop, MVT::v2i8, 12},
881     {Intrinsic::ctpop, MVT::v4i8, 12},
882     {Intrinsic::ctpop, MVT::v8i8, 12},
883     {Intrinsic::ctpop, MVT::v16i8, 12},
884     {Intrinsic::ctpop, MVT::nxv1i8, 12},
885     {Intrinsic::ctpop, MVT::nxv2i8, 12},
886     {Intrinsic::ctpop, MVT::nxv4i8, 12},
887     {Intrinsic::ctpop, MVT::nxv8i8, 12},
888     {Intrinsic::ctpop, MVT::nxv16i8, 12},
889     {Intrinsic::ctpop, MVT::v2i16, 19},
890     {Intrinsic::ctpop, MVT::v4i16, 19},
891     {Intrinsic::ctpop, MVT::v8i16, 19},
892     {Intrinsic::ctpop, MVT::v16i16, 19},
893     {Intrinsic::ctpop, MVT::nxv1i16, 19},
894     {Intrinsic::ctpop, MVT::nxv2i16, 19},
895     {Intrinsic::ctpop, MVT::nxv4i16, 19},
896     {Intrinsic::ctpop, MVT::nxv8i16, 19},
897     {Intrinsic::ctpop, MVT::nxv16i16, 19},
898     {Intrinsic::ctpop, MVT::v2i32, 20},
899     {Intrinsic::ctpop, MVT::v4i32, 20},
900     {Intrinsic::ctpop, MVT::v8i32, 20},
901     {Intrinsic::ctpop, MVT::v16i32, 20},
902     {Intrinsic::ctpop, MVT::nxv1i32, 20},
903     {Intrinsic::ctpop, MVT::nxv2i32, 20},
904     {Intrinsic::ctpop, MVT::nxv4i32, 20},
905     {Intrinsic::ctpop, MVT::nxv8i32, 20},
906     {Intrinsic::ctpop, MVT::nxv16i32, 20},
907     {Intrinsic::ctpop, MVT::v2i64, 21},
908     {Intrinsic::ctpop, MVT::v4i64, 21},
909     {Intrinsic::ctpop, MVT::v8i64, 21},
910     {Intrinsic::ctpop, MVT::v16i64, 21},
911     {Intrinsic::ctpop, MVT::nxv1i64, 21},
912     {Intrinsic::ctpop, MVT::nxv2i64, 21},
913     {Intrinsic::ctpop, MVT::nxv4i64, 21},
914     {Intrinsic::ctpop, MVT::nxv8i64, 21},
915     {Intrinsic::vp_ctpop, MVT::v2i8, 12},
916     {Intrinsic::vp_ctpop, MVT::v4i8, 12},
917     {Intrinsic::vp_ctpop, MVT::v8i8, 12},
918     {Intrinsic::vp_ctpop, MVT::v16i8, 12},
919     {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
920     {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
921     {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
922     {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
923     {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
924     {Intrinsic::vp_ctpop, MVT::v2i16, 19},
925     {Intrinsic::vp_ctpop, MVT::v4i16, 19},
926     {Intrinsic::vp_ctpop, MVT::v8i16, 19},
927     {Intrinsic::vp_ctpop, MVT::v16i16, 19},
928     {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
929     {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
930     {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
931     {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
932     {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
933     {Intrinsic::vp_ctpop, MVT::v2i32, 20},
934     {Intrinsic::vp_ctpop, MVT::v4i32, 20},
935     {Intrinsic::vp_ctpop, MVT::v8i32, 20},
936     {Intrinsic::vp_ctpop, MVT::v16i32, 20},
937     {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
938     {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
939     {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
940     {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
941     {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
942     {Intrinsic::vp_ctpop, MVT::v2i64, 21},
943     {Intrinsic::vp_ctpop, MVT::v4i64, 21},
944     {Intrinsic::vp_ctpop, MVT::v8i64, 21},
945     {Intrinsic::vp_ctpop, MVT::v16i64, 21},
946     {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
947     {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
948     {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
949     {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
950     {Intrinsic::vp_ctlz, MVT::v2i8, 19},
951     {Intrinsic::vp_ctlz, MVT::v4i8, 19},
952     {Intrinsic::vp_ctlz, MVT::v8i8, 19},
953     {Intrinsic::vp_ctlz, MVT::v16i8, 19},
954     {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
955     {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
956     {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
957     {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
958     {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
959     {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
960     {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
961     {Intrinsic::vp_ctlz, MVT::v2i16, 28},
962     {Intrinsic::vp_ctlz, MVT::v4i16, 28},
963     {Intrinsic::vp_ctlz, MVT::v8i16, 28},
964     {Intrinsic::vp_ctlz, MVT::v16i16, 28},
965     {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
966     {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
967     {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
968     {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
969     {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
970     {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
971     {Intrinsic::vp_ctlz, MVT::v2i32, 31},
972     {Intrinsic::vp_ctlz, MVT::v4i32, 31},
973     {Intrinsic::vp_ctlz, MVT::v8i32, 31},
974     {Intrinsic::vp_ctlz, MVT::v16i32, 31},
975     {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
976     {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
977     {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
978     {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
979     {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
980     {Intrinsic::vp_ctlz, MVT::v2i64, 35},
981     {Intrinsic::vp_ctlz, MVT::v4i64, 35},
982     {Intrinsic::vp_ctlz, MVT::v8i64, 35},
983     {Intrinsic::vp_ctlz, MVT::v16i64, 35},
984     {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
985     {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
986     {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
987     {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
988     {Intrinsic::vp_cttz, MVT::v2i8, 16},
989     {Intrinsic::vp_cttz, MVT::v4i8, 16},
990     {Intrinsic::vp_cttz, MVT::v8i8, 16},
991     {Intrinsic::vp_cttz, MVT::v16i8, 16},
992     {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
993     {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
994     {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
995     {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
996     {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
997     {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
998     {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
999     {Intrinsic::vp_cttz, MVT::v2i16, 23},
1000     {Intrinsic::vp_cttz, MVT::v4i16, 23},
1001     {Intrinsic::vp_cttz, MVT::v8i16, 23},
1002     {Intrinsic::vp_cttz, MVT::v16i16, 23},
1003     {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
1004     {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
1005     {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
1006     {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
1007     {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
1008     {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
1009     {Intrinsic::vp_cttz, MVT::v2i32, 24},
1010     {Intrinsic::vp_cttz, MVT::v4i32, 24},
1011     {Intrinsic::vp_cttz, MVT::v8i32, 24},
1012     {Intrinsic::vp_cttz, MVT::v16i32, 24},
1013     {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
1014     {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
1015     {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
1016     {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
1017     {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
1018     {Intrinsic::vp_cttz, MVT::v2i64, 25},
1019     {Intrinsic::vp_cttz, MVT::v4i64, 25},
1020     {Intrinsic::vp_cttz, MVT::v8i64, 25},
1021     {Intrinsic::vp_cttz, MVT::v16i64, 25},
1022     {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
1023     {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
1024     {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
1025     {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
1026 };
1027 
1028 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
1029   switch (ID) {
1030 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
1031   case Intrinsic::VPID:                                                        \
1032     return ISD::VPSD;
1033 #include "llvm/IR/VPIntrinsics.def"
1034 #undef HELPER_MAP_VPID_TO_VPSD
1035   }
1036   return ISD::DELETED_NODE;
1037 }
1038 
1039 InstructionCost
1040 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1041                                     TTI::TargetCostKind CostKind) {
1042   auto *RetTy = ICA.getReturnType();
1043   switch (ICA.getID()) {
1044   case Intrinsic::ceil:
1045   case Intrinsic::floor:
1046   case Intrinsic::trunc:
1047   case Intrinsic::rint:
1048   case Intrinsic::round:
1049   case Intrinsic::roundeven: {
1050     // These all use the same code.
1051     auto LT = getTypeLegalizationCost(RetTy);
1052     if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1053       return LT.first * 8;
1054     break;
1055   }
1056   case Intrinsic::umin:
1057   case Intrinsic::umax:
1058   case Intrinsic::smin:
1059   case Intrinsic::smax: {
1060     auto LT = getTypeLegalizationCost(RetTy);
1061     if ((ST->hasVInstructions() && LT.second.isVector()) ||
1062         (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
1063       return LT.first;
1064     break;
1065   }
1066   case Intrinsic::sadd_sat:
1067   case Intrinsic::ssub_sat:
1068   case Intrinsic::uadd_sat:
1069   case Intrinsic::usub_sat:
1070   case Intrinsic::fabs:
1071   case Intrinsic::sqrt: {
1072     auto LT = getTypeLegalizationCost(RetTy);
1073     if (ST->hasVInstructions() && LT.second.isVector())
1074       return LT.first;
1075     break;
1076   }
1077   case Intrinsic::abs: {
1078     auto LT = getTypeLegalizationCost(RetTy);
1079     if (ST->hasVInstructions() && LT.second.isVector()) {
1080       // vrsub.vi v10, v8, 0
1081       // vmax.vv v8, v8, v10
1082       return LT.first * 2;
1083     }
1084     break;
1085   }
1086   // TODO: add more intrinsics.
1087   case Intrinsic::experimental_stepvector: {
1088     unsigned Cost = 1; // vid
1089     auto LT = getTypeLegalizationCost(RetTy);
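    // Each additional part produced by type legalization is assumed to add
    // roughly one more instruction.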
1090     return Cost + (LT.first - 1);
1091   }
1092   case Intrinsic::vp_rint: {
1093     // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1094     unsigned Cost = 5;
1095     auto LT = getTypeLegalizationCost(RetTy);
1096     if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1097       return Cost * LT.first;
1098     break;
1099   }
1100   case Intrinsic::vp_nearbyint: {
1101     // One more read and one more write of fflags than vp_rint.
1102     unsigned Cost = 7;
1103     auto LT = getTypeLegalizationCost(RetTy);
1104     if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1105       return Cost * LT.first;
1106     break;
1107   }
1108   case Intrinsic::vp_ceil:
1109   case Intrinsic::vp_floor:
1110   case Intrinsic::vp_round:
1111   case Intrinsic::vp_roundeven:
1112   case Intrinsic::vp_roundtozero: {
1113     // Rounding with a static rounding mode needs two more instructions than
1114     // vp_rint, to swap and write FRM.
1115     unsigned Cost = 7;
1116     auto LT = getTypeLegalizationCost(RetTy);
1117     unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1118     if (TLI->isOperationCustom(VPISD, LT.second))
1119       return Cost * LT.first;
1120     break;
1121   }
1122   }
1123 
1124   if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1125     auto LT = getTypeLegalizationCost(RetTy);
1126     if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1127                                             ICA.getID(), LT.second))
1128       return LT.first * Entry->Cost;
1129   }
1130 
1131   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1132 }
1133 
1134 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1135                                                Type *Src,
1136                                                TTI::CastContextHint CCH,
1137                                                TTI::TargetCostKind CostKind,
1138                                                const Instruction *I) {
1139   if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
1140     // FIXME: Need to compute legalizing cost for illegal types.
1141     if (!isTypeLegal(Src) || !isTypeLegal(Dst))
1142       return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1143 
1144     // Skip if element size of Dst or Src is bigger than ELEN.
1145     if (Src->getScalarSizeInBits() > ST->getELEN() ||
1146         Dst->getScalarSizeInBits() > ST->getELEN())
1147       return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1148 
1149     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1150     assert(ISD && "Invalid opcode");
1151 
1152     // FIXME: Need to consider vsetvli and lmul.
1153     int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1154                   (int)Log2_32(Src->getScalarSizeInBits());
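    // PowDiff counts the halving/doubling steps between the element widths,
    // e.g. 2 for an i16 <-> i64 conversion.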
1155     switch (ISD) {
1156     case ISD::SIGN_EXTEND:
1157     case ISD::ZERO_EXTEND:
1158       if (Src->getScalarSizeInBits() == 1) {
1159         // We do not use vsext/vzext to extend from a mask vector.
1160         // Instead we use the following instructions to extend from a mask vector:
1161         // vmv.v.i v8, 0
1162         // vmerge.vim v8, v8, -1, v0
1163         return 2;
1164       }
1165       return 1;
1166     case ISD::TRUNCATE:
1167       if (Dst->getScalarSizeInBits() == 1) {
1168         // We do not use several vncvt instructions to truncate to a mask vector,
1169         // so we cannot use PowDiff to calculate it.
1170         // Instead we use the following instructions to truncate to a mask vector:
1171         // vand.vi v8, v8, 1
1172         // vmsne.vi v0, v8, 0
1173         return 2;
1174       }
1175       [[fallthrough]];
1176     case ISD::FP_EXTEND:
1177     case ISD::FP_ROUND:
1178       // Counts of narrow/widen instructions.
1179       return std::abs(PowDiff);
1180     case ISD::FP_TO_SINT:
1181     case ISD::FP_TO_UINT:
1182     case ISD::SINT_TO_FP:
1183     case ISD::UINT_TO_FP:
1184       if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1185         // The cost of converting from or to a mask vector is different from the
1186         // other cases, so we cannot use PowDiff to calculate it.
1187         // For mask vector to fp, we use the following instructions:
1188         // vmv.v.i v8, 0
1189         // vmerge.vim v8, v8, -1, v0
1190         // vfcvt.f.x.v v8, v8
1191 
1192         // And for fp vector to mask, we use:
1193         // vfncvt.rtz.x.f.w v9, v8
1194         // vand.vi v8, v9, 1
1195         // vmsne.vi v0, v8, 0
1196         return 3;
1197       }
1198       if (std::abs(PowDiff) <= 1)
1199         return 1;
1200       // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1201       // so it only needs two conversions.
1202       if (Src->isIntOrIntVectorTy())
1203         return 2;
1204       // Counts of narrow/widen instructions.
1205       return std::abs(PowDiff);
1206     }
1207   }
1208   return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1209 }
1210 
1211 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
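  // For scalable types, estimate VL as the VLMAX implied by the vscale used
  // for tuning; for fixed vectors, VL is simply the element count.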
1212   if (isa<ScalableVectorType>(Ty)) {
1213     const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1214     const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1215     const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1216     return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1217   }
1218   return cast<FixedVectorType>(Ty)->getNumElements();
1219 }
1220 
1221 InstructionCost
1222 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1223                                      FastMathFlags FMF,
1224                                      TTI::TargetCostKind CostKind) {
1225   if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1226     return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1227 
1228   // Skip if scalar size of Ty is bigger than ELEN.
1229   if (Ty->getScalarSizeInBits() > ST->getELEN())
1230     return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1231 
1232   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1233   if (Ty->getElementType()->isIntegerTy(1))
1234     // vcpop sequences, see vreduction-mask.ll.  umax and smin actually only
1235     // cost 2, but we don't have enough info here, so we slightly overestimate.
1236     return (LT.first - 1) + 3;
1237 
1238   // An IR reduction is composed of two vmv and one RVV reduction instruction.
1239   InstructionCost BaseCost = 2;
1240 
1241   if (CostKind == TTI::TCK_CodeSize)
1242     return (LT.first - 1) + BaseCost;
1243 
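  // For throughput, model the reduction itself as roughly log2(VL) steps.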
1244   unsigned VL = getEstimatedVLFor(Ty);
1245   return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1246 }
1247 
1248 InstructionCost
1249 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1250                                          std::optional<FastMathFlags> FMF,
1251                                          TTI::TargetCostKind CostKind) {
1252   if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1253     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1254 
1255   // Skip if scalar size of Ty is bigger than ELEN.
1256   if (Ty->getScalarSizeInBits() > ST->getELEN())
1257     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1258 
1259   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1260   assert(ISD && "Invalid opcode");
1261 
1262   if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1263       ISD != ISD::FADD)
1264     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1265 
1266   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1267   if (Ty->getElementType()->isIntegerTy(1))
1268     // vcpop sequences, see vreduction-mask.ll
1269     return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1270 
1271   // An IR reduction is composed of two vmv and one RVV reduction instruction.
1272   InstructionCost BaseCost = 2;
1273 
1274   if (CostKind == TTI::TCK_CodeSize)
1275     return (LT.first - 1) + BaseCost;
1276 
1277   unsigned VL = getEstimatedVLFor(Ty);
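  // An ordered FP reduction must process the elements sequentially, so its
  // cost scales with VL; otherwise assume a log2(VL)-deep reduction tree.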
1278   if (TTI::requiresOrderedReduction(FMF))
1279     return (LT.first - 1) + BaseCost + VL;
1280   return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1281 }
1282 
1283 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1284     unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1285     FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1286   if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1287     return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1288                                            FMF, CostKind);
1289 
1290   // Skip if scalar size of ResTy is bigger than ELEN.
1291   if (ResTy->getScalarSizeInBits() > ST->getELEN())
1292     return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1293                                            FMF, CostKind);
1294 
1295   if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1296     return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1297                                            FMF, CostKind);
1298 
1299   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1300 
1301   if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1302     return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1303                                            FMF, CostKind);
1304 
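     // The extend is expected to fold into a widening reduction (e.g.
     // vwredsum[u] or vfwredusum), so only the split overhead plus the cost of
     // the plain reduction is charged here.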
1305   return (LT.first - 1) +
1306          getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1307 }
1308 
1309 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1310                                               TTI::OperandValueInfo OpInfo,
1311                                               TTI::TargetCostKind CostKind) {
1312   assert(OpInfo.isConstant() && "non constant operand?");
1313   if (!isa<VectorType>(Ty))
1314     // FIXME: We need to account for immediate materialization here, but doing
1315     // a decent job requires more knowledge about the immediate than we
1316     // currently have here.
1317     return 0;
1318 
1319   if (OpInfo.isUniform())
1320     // vmv.v.i, vmv.v.x, or vfmv.v.f
1321     // We ignore the cost of the scalar constant materialization to be
1322     // consistent with how we treat scalar constants themselves just above.
1323     return 1;
1324 
1325   return getConstantPoolLoadCost(Ty, CostKind);
1326 }
1327 
1329 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1330                                               MaybeAlign Alignment,
1331                                               unsigned AddressSpace,
1332                                               TTI::TargetCostKind CostKind,
1333                                               TTI::OperandValueInfo OpInfo,
1334                                               const Instruction *I) {
1335   EVT VT = TLI->getValueType(DL, Src, true);
1336   // Type legalization can't handle structs
1337   if (VT == MVT::Other)
1338     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1339                                   CostKind, OpInfo, I);
1340 
1341   InstructionCost Cost = 0;
1342   if (Opcode == Instruction::Store && OpInfo.isConstant())
1343     Cost += getStoreImmCost(Src, OpInfo, CostKind);
1344   InstructionCost BaseCost =
1345     BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1346                            CostKind, OpInfo, I);
1347   // Assume memory op costs scale with the number of vector registers
1348   // possibly accessed by the instruction.  Note that BasicTTI already
1349   // handles the LT.first term for us.
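       // For example, an access occupying eight vector registers (LMUL=8) ends
       // up costed roughly eight times higher than an equivalent LMUL=1 access.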
1350   if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1351       LT.second.isVector())
1352     BaseCost *= getLMULCost(LT.second);
1353   return Cost + BaseCost;
1355 }
1356 
1357 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1358                                                  Type *CondTy,
1359                                                  CmpInst::Predicate VecPred,
1360                                                  TTI::TargetCostKind CostKind,
1361                                                  const Instruction *I) {
1362   if (CostKind != TTI::TCK_RecipThroughput)
1363     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1364                                      I);
1365 
1366   if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1367     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1368                                      I);
1369 
1370   // Skip if scalar size of ValTy is bigger than ELEN.
1371   if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
1372     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1373                                      I);
1374 
1375   if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1376     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1377     if (CondTy->isVectorTy()) {
1378       if (ValTy->getScalarSizeInBits() == 1) {
1379         // vmandn.mm v8, v8, v9
1380         // vmand.mm v9, v0, v9
1381         // vmor.mm v0, v9, v8
1382         return LT.first * 3;
1383       }
1384       // vselect and max/min are supported natively.
1385       return LT.first * 1;
1386     }
1387 
1388     if (ValTy->getScalarSizeInBits() == 1) {
1389       //  vmv.v.x v9, a0
1390       //  vmsne.vi v9, v9, 0
1391       //  vmandn.mm v8, v8, v9
1392       //  vmand.mm v9, v0, v9
1393       //  vmor.mm v0, v9, v8
1394       return LT.first * 5;
1395     }
1396 
1397     // vmv.v.x v10, a0
1398     // vmsne.vi v0, v10, 0
1399     // vmerge.vvm v8, v9, v8, v0
1400     return LT.first * 3;
1401   }
1402 
1403   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1404       ValTy->isVectorTy()) {
1405     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1406 
1407     // Supported natively.
1408     if (CmpInst::isIntPredicate(VecPred))
1409       return LT.first * 1;
1410 
1411     // If we do not support the input floating-point vector type, fall back to
1412     // the base implementation, which computes the cost as:
1413     //   ScalarizeCost + Num * Cost for fixed vectors,
1414     //   InvalidCost for scalable vectors.
1415     if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1416         (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1417         (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1418       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1419                                        I);
1420     switch (VecPred) {
1421       // Supported natively.
1422     case CmpInst::FCMP_OEQ:
1423     case CmpInst::FCMP_OGT:
1424     case CmpInst::FCMP_OGE:
1425     case CmpInst::FCMP_OLT:
1426     case CmpInst::FCMP_OLE:
1427     case CmpInst::FCMP_UNE:
1428       return LT.first * 1;
1429     // TODO: Other comparisons?
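         // Other predicates (e.g. FCMP_ONE or the remaining unordered forms)
         // presumably expand to multiple mask instructions, so fall back to the
         // base implementation for now.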
1430     default:
1431       break;
1432     }
1433   }
1434 
1435   // TODO: Add cost for scalar type.
1436 
1437   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1438 }
1439 
1440 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1441                                                  TTI::TargetCostKind CostKind,
1442                                                  unsigned Index, Value *Op0,
1443                                                  Value *Op1) {
1444   assert(Val->isVectorTy() && "This must be a vector type");
1445 
1446   if (Opcode != Instruction::ExtractElement &&
1447       Opcode != Instruction::InsertElement)
1448     return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1449 
1450   // Legalize the type.
1451   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1452 
1453   // This type is legalized to a scalar type.
1454   if (!LT.second.isVector())
1455     return 0;
1456 
1457   // For an unsupported scalable vector type, return the invalid cost.
1458   if (LT.second.isScalableVector() && !LT.first.isValid())
1459     return LT.first;
1460 
1461   if (!isTypeLegal(Val))
1462     return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1463 
1464   // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
1465   // vector, and vslideup + vmv.s.x to insert an element into a vector.
1466   unsigned BaseCost = 1;
1467   // Insertelement additionally needs an addi to compute Index+1 for the vslideup.
1468   unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1469 
1470   if (Index != -1U) {
1471     // The type may be split. For fixed-width vectors we can normalize the
1472     // index to the new type.
1473     if (LT.second.isFixedLengthVector()) {
1474       unsigned Width = LT.second.getVectorNumElements();
1475       Index = Index % Width;
1476     }
1477 
1478     // We could extract/insert the first element without vslidedown/vslideup.
1479     if (Index == 0)
1480       SlideCost = 0;
1481     else if (Opcode == Instruction::InsertElement)
1482       SlideCost = 1; // With a constant index, we do not need to use addi.
1483   }
1484 
1485   // Mask vector extract/insert element differs from the normal case.
1486   if (Val->getScalarSizeInBits() == 1) {
1487     // For extractelement, we need the following instructions:
1488     // vmv.v.i v8, 0
1489     // vmerge.vim v8, v8, 1, v0
1490     // vsetivli zero, 1, e8, m2, ta, mu (not counted)
1491     // vslidedown.vx v8, v8, a0
1492     // vmv.x.s a0, v8
1493 
1494     // For insertelement, we need the following instructions:
1495     // vsetvli a2, zero, e8, m1, ta, mu (not counted)
1496     // vmv.s.x v8, a0
1497     // vmv.v.i v9, 0
1498     // vmerge.vim v9, v9, 1, v0
1499     // addi a0, a1, 1
1500     // vsetvli zero, a0, e8, m1, tu, mu (not counted)
1501     // vslideup.vx v9, v8, a1
1502     // vsetvli a0, zero, e8, m1, ta, mu (not counted)
1503     // vand.vi v8, v9, 1
1504     // vmsne.vi v0, v8, 0
1505 
1506     // TODO: should we count these special vsetvlis?
1507     BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
1508   }
1509   // Extract/insert of i64 on a target with XLEN=32 needs more instructions.
1510   if (Val->getScalarType()->isIntegerTy() &&
1511       ST->getXLen() < Val->getScalarSizeInBits()) {
1512     // For extractelement, we need the following instructions:
1513     // vsetivli zero, 1, e64, m1, ta, mu (not counted)
1514     // vslidedown.vx v8, v8, a0
1515     // vmv.x.s a0, v8
1516     // li a1, 32
1517     // vsrl.vx v8, v8, a1
1518     // vmv.x.s a1, v8
1519 
1520     // For insertelement, we need the following instructions:
1521     // vsetivli zero, 2, e32, m4, ta, mu (not counted)
1522     // vmv.v.i v12, 0
1523     // vslide1up.vx v16, v12, a1
1524     // vslide1up.vx v12, v16, a0
1525     // addi a0, a2, 1
1526     // vsetvli zero, a0, e64, m4, tu, mu (not counted)
1527     // vslideup.vx v8, v12, a2
1528 
1529     // TODO: should we count these special vsetvlis?
1530     BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1531   }
1532   return BaseCost + SlideCost;
1533 }
1534 
1535 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1536     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1537     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1538     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1539 
1540   // TODO: Handle more cost kinds.
1541   if (CostKind != TTI::TCK_RecipThroughput)
1542     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1543                                          Args, CxtI);
1544 
1545   if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1546     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1547                                          Args, CxtI);
1548 
1549   // Skip if scalar size of Ty is bigger than ELEN.
1550   if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
1551     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1552                                          Args, CxtI);
1553 
1554   // Legalize the type.
1555   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1556 
1557   // TODO: Handle scalar type.
1558   if (!LT.second.isVector())
1559     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1560                                          Args, CxtI);
1561 
1563   auto getConstantMatCost =
1564     [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1565     if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1566       // Two sub-cases:
1567       // * Has a 5-bit immediate operand which can be splatted.
1568       // * Has a larger immediate which must be materialized in a scalar register.
1569       // We return 0 for both as we currently ignore the cost of materializing
1570       // scalar constants in GPRs.
1571       return 0;
1572 
1573     return getConstantPoolLoadCost(Ty, CostKind);
1574   };
1575 
1576   // Add the cost of materializing any constant vectors required.
1577   InstructionCost ConstantMatCost = 0;
1578   if (Op1Info.isConstant())
1579     ConstantMatCost += getConstantMatCost(0, Op1Info);
1580   if (Op2Info.isConstant())
1581     ConstantMatCost += getConstantMatCost(1, Op2Info);
1582 
1583   switch (TLI->InstructionOpcodeToISD(Opcode)) {
1584   case ISD::ADD:
1585   case ISD::SUB:
1586   case ISD::AND:
1587   case ISD::OR:
1588   case ISD::XOR:
1589   case ISD::SHL:
1590   case ISD::SRL:
1591   case ISD::SRA:
1592   case ISD::MUL:
1593   case ISD::MULHS:
1594   case ISD::MULHU:
1595   case ISD::FADD:
1596   case ISD::FSUB:
1597   case ISD::FMUL:
1598   case ISD::FNEG: {
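         // Each of these is assumed to map to a single vector instruction per
         // legalized part, so charge one op scaled by LMUL for each split part.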
1599     return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
1600   }
1601   default:
1602     return ConstantMatCost +
1603            BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1604                                          Args, CxtI);
1605   }
1606 }
1607 
1608 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1609 InstructionCost RISCVTTIImpl::getPointersChainCost(
1610     ArrayRef<const Value *> Ptrs, const Value *Base,
1611     const TTI::PointersChainInfo &Info, Type *AccessTy,
1612     TTI::TargetCostKind CostKind) {
1613   InstructionCost Cost = TTI::TCC_Free;
1614   // In the basic model we only take GEP instructions into account (although a
1615   // pointer here may also be an alloca, a plain value, a constant or constant
1616   // expression, a PHI, a bitcast ... anything that is allowed to be used as a
1617   // pointer). Typically, if Base is not a GEP instruction and all the pointers
1618   // are relative to the same base address, the rest are either GEP
1619   // instructions, PHIs, bitcasts or constants. When they share a base, we
1620   // simply cost each non-Base GEP as an ADD operation if any of its indices
1621   // is non-constant.
1622   // If there are no known dependencies between the pointers, the cost is
1623   // calculated as the sum of the costs of the GEP instructions.
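       // For example, in a same-base unit-stride chain, a non-base GEP is free
       // if its indices are all constant or if reg+imm addressing can absorb
       // its offset; otherwise it costs a single ADD.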
1624   for (auto [I, V] : enumerate(Ptrs)) {
1625     const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1626     if (!GEP)
1627       continue;
1628     if (Info.isSameBase() && V != Base) {
1629       if (GEP->hasAllConstantIndices())
1630         continue;
1631       // If the chain is unit-stride and BaseReg + stride*i is a legal
1632       // addressing mode, then presume the base GEP is sitting around in a
1633       // register somewhere and check if we can fold the offset relative to
1634       // it.
1635       unsigned Stride = DL.getTypeStoreSize(AccessTy);
1636       if (Info.isUnitStride() &&
1637           isLegalAddressingMode(AccessTy,
1638                                 /* BaseGV */ nullptr,
1639                                 /* BaseOffset */ Stride * I,
1640                                 /* HasBaseReg */ true,
1641                                 /* Scale */ 0,
1642                                 GEP->getType()->getPointerAddressSpace()))
1643         continue;
1644       Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1645                                      {TTI::OK_AnyValue, TTI::OP_None},
1646                                      {TTI::OK_AnyValue, TTI::OP_None},
1647                                      std::nullopt);
1648     } else {
1649       SmallVector<const Value *> Indices(GEP->indices());
1650       Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1651                          Indices, AccessTy, CostKind);
1652     }
1653   }
1654   return Cost;
1655 }
1656 
1657 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1658                                            TTI::UnrollingPreferences &UP,
1659                                            OptimizationRemarkEmitter *ORE) {
1660   // TODO: All of the settings below would benefit from more tuning on
1661   //       benchmarks and the associated performance metrics.
1662 
1664   if (ST->enableDefaultUnroll())
1665     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1666 
1667   // Enable upper-bound unrolling universally; it is not dependent upon the
1668   // conditions below.
1669   UP.UpperBound = true;
1670 
1671   // Disable loop unrolling for Oz and Os.
1672   UP.OptSizeThreshold = 0;
1673   UP.PartialOptSizeThreshold = 0;
1674   if (L->getHeader()->getParent()->hasOptSize())
1675     return;
1676 
1677   SmallVector<BasicBlock *, 4> ExitingBlocks;
1678   L->getExitingBlocks(ExitingBlocks);
1679   LLVM_DEBUG(dbgs() << "Loop has:\n"
1680                     << "Blocks: " << L->getNumBlocks() << "\n"
1681                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
1682 
1683   // Only allow at most one exit other than the latch. This acts as an early
1684   // exit, as it mirrors the profitability calculation of the runtime unroller.
1685   if (ExitingBlocks.size() > 2)
1686     return;
1687 
1688   // Limit the CFG of the loop body for targets with a branch predictor.
1689   // Allowing 4 blocks permits if-then-else diamonds in the body.
1690   if (L->getNumBlocks() > 4)
1691     return;
1692 
1693   // Don't unroll vectorized loops, including the remainder loop
1694   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1695     return;
1696 
1697   // Scan the loop: don't unroll loops with calls as this could prevent
1698   // inlining.
1699   InstructionCost Cost = 0;
1700   for (auto *BB : L->getBlocks()) {
1701     for (auto &I : *BB) {
1702       // Initial setting - Don't unroll loops containing vectorized
1703       // instructions.
1704       if (I.getType()->isVectorTy())
1705         return;
1706 
1707       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1708         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1709           if (!isLoweredToCall(F))
1710             continue;
1711         }
1712         return;
1713       }
1714 
1715       SmallVector<const Value *> Operands(I.operand_values());
1716       Cost += getInstructionCost(&I, Operands,
1717                                  TargetTransformInfo::TCK_SizeAndLatency);
1718     }
1719   }
1720 
1721   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1722 
1723   UP.Partial = true;
1724   UP.Runtime = true;
1725   UP.UnrollRemainder = true;
1726   UP.UnrollAndJam = true;
1727   UP.UnrollAndJamInnerLoopThreshold = 60;
1728 
1729   // Forcing the unrolling of small loops can be very useful because of the
1730   // branch-taken cost of the backedge.
1731   if (Cost < 12)
1732     UP.Force = true;
1733 }
1734 
1735 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1736                                          TTI::PeelingPreferences &PP) {
1737   BaseT::getPeelingPreferences(L, SE, PP);
1738 }
1739 
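     // For scalable vector types this counts how many vector registers
     // (RVVBitsPerBlock-sized blocks) the type needs; fixed-length vector
     // types are measured against the minimum VLEN instead.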
1740 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1741   TypeSize Size = DL.getTypeSizeInBits(Ty);
1742   if (Ty->isVectorTy()) {
1743     if (Size.isScalable() && ST->hasVInstructions())
1744       return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1745 
1746     if (ST->useRVVForFixedLengthVectors())
1747       return divideCeil(Size, ST->getRealMinVLen());
1748   }
1749 
1750   return BaseT::getRegUsageForType(Ty);
1751 }
1752 
1753 unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1754   if (SLPMaxVF.getNumOccurrences())
1755     return SLPMaxVF;
1756 
1757   // Return how many elements can fit in getRegisterBitWidth.  This is the
1758   // same routine as is used by the LoopVectorizer.  We should probably be
1759   // accounting for whether we actually have instructions with the right
1760   // lane type, but we don't have enough information to do that without
1761   // some additional plumbing, which hasn't been justified yet.
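       // For example, a reported register width of 256 bits with 32-bit
       // elements yields a maximum VF of 8.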
1762   TypeSize RegWidth =
1763     getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1764   // If no vector registers, or absurd element widths, disable
1765   // vectorization by returning 1.
1766   return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1767 }
1768 
1769 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1770                                  const TargetTransformInfo::LSRCost &C2) {
1771   // RISC-V-specific here: give the instruction count first priority.
1772   return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1773                   C1.NumIVMuls, C1.NumBaseAdds,
1774                   C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1775          std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1776                   C2.NumIVMuls, C2.NumBaseAdds,
1777                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1778 }
1779