1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "RISCVTargetTransformInfo.h" 10 #include "MCTargetDesc/RISCVMatInt.h" 11 #include "llvm/ADT/STLExtras.h" 12 #include "llvm/Analysis/TargetTransformInfo.h" 13 #include "llvm/CodeGen/BasicTTIImpl.h" 14 #include "llvm/CodeGen/CostTable.h" 15 #include "llvm/CodeGen/TargetLowering.h" 16 #include "llvm/IR/Instructions.h" 17 #include <cmath> 18 #include <optional> 19 using namespace llvm; 20 21 #define DEBUG_TYPE "riscvtti" 22 23 static cl::opt<unsigned> RVVRegisterWidthLMUL( 24 "riscv-v-register-bit-width-lmul", 25 cl::desc( 26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " 27 "by autovectorized code. Fractional LMULs are not supported."), 28 cl::init(2), cl::Hidden); 29 30 static cl::opt<unsigned> SLPMaxVF( 31 "riscv-v-slp-max-vf", 32 cl::desc( 33 "Overrides result used for getMaximumVF query which is used " 34 "exclusively by SLP vectorizer."), 35 cl::Hidden); 36 37 InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) { 38 // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is 39 // implementation-defined. 40 if (!VT.isVector()) 41 return InstructionCost::getInvalid(); 42 unsigned DLenFactor = ST->getDLenFactor(); 43 unsigned Cost; 44 if (VT.isScalableVector()) { 45 unsigned LMul; 46 bool Fractional; 47 std::tie(LMul, Fractional) = 48 RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT)); 49 if (Fractional) 50 Cost = LMul <= DLenFactor ? (DLenFactor / LMul) : 1; 51 else 52 Cost = (LMul * DLenFactor); 53 } else { 54 Cost = divideCeil(VT.getSizeInBits(), ST->getRealMinVLen() / DLenFactor); 55 } 56 return Cost; 57 } 58 59 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 60 TTI::TargetCostKind CostKind) { 61 assert(Ty->isIntegerTy() && 62 "getIntImmCost can only estimate cost of materialising integers"); 63 64 // We have a Zero register, so 0 is always free. 65 if (Imm == 0) 66 return TTI::TCC_Free; 67 68 // Otherwise, we check how many instructions it will take to materialise. 69 const DataLayout &DL = getDataLayout(); 70 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), 71 getST()->getFeatureBits()); 72 } 73 74 // Look for patterns of shift followed by AND that can be turned into a pair of 75 // shifts. We won't need to materialize an immediate for the AND so these can 76 // be considered free. 77 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { 78 uint64_t Mask = Imm.getZExtValue(); 79 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0)); 80 if (!BO || !BO->hasOneUse()) 81 return false; 82 83 if (BO->getOpcode() != Instruction::Shl) 84 return false; 85 86 if (!isa<ConstantInt>(BO->getOperand(1))) 87 return false; 88 89 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue(); 90 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1 91 // is a mask shifted by c2 bits with c3 leading zeros. 
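  // Illustrative example (added for clarity): Mask = 0x3fc0 is an 8-bit mask
  // shifted left by 6, so countr_zero(Mask) == 6. When ShAmt is also 6,
  // (and (shl x, 6), 0x3fc0) can be selected as (srli (slli x, 56), 50),
  // i.e. c2 = 6 and c3 = 50, with no constant materialization for the AND.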
92 if (isShiftedMask_64(Mask)) { 93 unsigned Trailing = llvm::countr_zero(Mask); 94 if (ShAmt == Trailing) 95 return true; 96 } 97 98 return false; 99 } 100 101 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 102 const APInt &Imm, Type *Ty, 103 TTI::TargetCostKind CostKind, 104 Instruction *Inst) { 105 assert(Ty->isIntegerTy() && 106 "getIntImmCost can only estimate cost of materialising integers"); 107 108 // We have a Zero register, so 0 is always free. 109 if (Imm == 0) 110 return TTI::TCC_Free; 111 112 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are 113 // commutative, in others the immediate comes from a specific argument index. 114 bool Takes12BitImm = false; 115 unsigned ImmArgIdx = ~0U; 116 117 switch (Opcode) { 118 case Instruction::GetElementPtr: 119 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will 120 // split up large offsets in GEP into better parts than ConstantHoisting 121 // can. 122 return TTI::TCC_Free; 123 case Instruction::And: 124 // zext.h 125 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb()) 126 return TTI::TCC_Free; 127 // zext.w 128 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba()) 129 return TTI::TCC_Free; 130 // bclri 131 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2()) 132 return TTI::TCC_Free; 133 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && 134 canUseShiftPair(Inst, Imm)) 135 return TTI::TCC_Free; 136 Takes12BitImm = true; 137 break; 138 case Instruction::Add: 139 Takes12BitImm = true; 140 break; 141 case Instruction::Or: 142 case Instruction::Xor: 143 // bseti/binvi 144 if (ST->hasStdExtZbs() && Imm.isPowerOf2()) 145 return TTI::TCC_Free; 146 Takes12BitImm = true; 147 break; 148 case Instruction::Mul: 149 // Power of 2 is a shift. Negated power of 2 is a shift and a negate. 150 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2()) 151 return TTI::TCC_Free; 152 // FIXME: There is no MULI instruction. 153 Takes12BitImm = true; 154 break; 155 case Instruction::Sub: 156 case Instruction::Shl: 157 case Instruction::LShr: 158 case Instruction::AShr: 159 Takes12BitImm = true; 160 ImmArgIdx = 1; 161 break; 162 default: 163 break; 164 } 165 166 if (Takes12BitImm) { 167 // Check immediate is the correct argument... 168 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) { 169 // ... and fits into the 12-bit immediate. 170 if (Imm.getSignificantBits() <= 64 && 171 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) { 172 return TTI::TCC_Free; 173 } 174 } 175 176 // Otherwise, use the full materialisation cost. 177 return getIntImmCost(Imm, Ty, CostKind); 178 } 179 180 // By default, prevent hoisting. 181 return TTI::TCC_Free; 182 } 183 184 InstructionCost 185 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 186 const APInt &Imm, Type *Ty, 187 TTI::TargetCostKind CostKind) { 188 // Prevent hoisting in unknown cases. 189 return TTI::TCC_Free; 190 } 191 192 TargetTransformInfo::PopcntSupportKind 193 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) { 194 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 195 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software; 196 } 197 198 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { 199 // Currently, the ExpandReductions pass can't expand scalable-vector 200 // reductions, but we still request expansion as RVV doesn't support certain 201 // reductions and the SelectionDAG can't legalize them either. 
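  // For example (illustrative): a fixed-length llvm.vector.reduce.mul.v8i32
  // is expanded by ExpandReductions into a three-step shuffle-and-multiply
  // tree, since RVV has no multiply reduction instruction; the scalable form
  // cannot be expanded that way, which is the gap noted above.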
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

/// Return the cost of a vrgather.vv instruction for the type VT. vrgather.vv
/// is generally quadratic in the number of vregs implied by LMUL. Note that
/// the operands (index and possibly mask) are handled separately.
InstructionCost RISCVTTIImpl::getVRGatherVVCost(MVT VT) {
  return getLMULCost(VT) * getLMULCost(VT);
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
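  // Worked example (illustrative, assuming getDLenFactor() == 1): an LMUL=8
  // type has getLMULCost == 8, so a single vrgather.vv used by the permute
  // cases below is modeled via getVRGatherVVCost as 8 * 8 = 64, while at
  // LMUL=1 it costs just 1.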
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN, then shuffles that interleave
        // or deinterleave 2 vectors can be lowered into the following
        // sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELEN()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getLMULCost(LT.second);
          }
        }

        // vrgather + cost of generating the mask constant.
        // We model this for an unknown mask with a single vrgather.
        if (LT.first == 1 &&
            (LT.second.getScalarSizeInBits() != 8 ||
             LT.second.getVectorNumElements() <= 256)) {
          VectorType *IdxTy =
              getVRGatherIndexType(LT.second, *ST, Tp->getContext());
          InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
          return IndexCost + getVRGatherVVCost(LT.second);
        }
      }
      break;
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        // 2 x (vrgather + cost of generating the mask constant) + cost of mask
        // register for the second vrgather. We model this for an unknown
        // (shuffle) mask.
        if (LT.first == 1 &&
            (LT.second.getScalarSizeInBits() != 8 ||
             LT.second.getVectorNumElements() <= 256)) {
          auto &C = Tp->getContext();
          auto EC = Tp->getElementCount();
          VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
          VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
          InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
          InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
          return 2 * IndexCost + 2 * getVRGatherVVCost(LT.second) + MaskCost;
        }
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first * getLMULCost(LT.second);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first * getLMULCost(LT.second);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3 * getLMULCost(LT.second);
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first * getLMULCost(LT.second) * 3;
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0

      return LT.first * getLMULCost(LT.second) * 6;
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first * getLMULCost(LT.second);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    // TODO: vrgather could be slower than vmv.v.x. It is
    // implementation-dependent.
    return LT.first * getLMULCost(LT.second);
  }
  case TTI::SK_Splice:
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    return 2 * LT.first * getLMULCost(LT.second);
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5-bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    InstructionCost GatherCost = 2 + getVRGatherVVCost(LT.second);
    // Mask operations additionally require an extend and truncate
    InstructionCost ExtendCost =
        Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store combined with a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat them as if they were just one
  // (legal) memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // its getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
512 // An interleaved store for 3 vectors of 4 lanes will look like 513 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7> 514 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3> 515 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11> 516 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask> 517 // store <12 x i32> %interleaved.vec, ptr %10, align 4 518 if (Factor != 2) 519 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 520 Alignment, AddressSpace, CostKind, 521 UseMaskForCond, UseMaskForGaps); 522 523 assert(Opcode == Instruction::Store && "Opcode must be a store"); 524 // For an interleaving store of 2 vectors, we perform one large interleaving 525 // shuffle that goes into the wide store 526 auto Mask = createInterleaveMask(VF, Factor); 527 InstructionCost ShuffleCost = 528 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask, 529 CostKind, 0, nullptr, {}); 530 return MemCost + ShuffleCost; 531 } 532 533 InstructionCost RISCVTTIImpl::getGatherScatterOpCost( 534 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 535 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 536 if (CostKind != TTI::TCK_RecipThroughput) 537 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 538 Alignment, CostKind, I); 539 540 if ((Opcode == Instruction::Load && 541 !isLegalMaskedGather(DataTy, Align(Alignment))) || 542 (Opcode == Instruction::Store && 543 !isLegalMaskedScatter(DataTy, Align(Alignment)))) 544 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 545 Alignment, CostKind, I); 546 547 // Cost is proportional to the number of memory operations implied. For 548 // scalable vectors, we use an estimate on that number since we don't 549 // know exactly what VL will be. 550 auto &VTy = *cast<VectorType>(DataTy); 551 InstructionCost MemOpCost = 552 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind, 553 {TTI::OK_AnyValue, TTI::OP_None}, I); 554 unsigned NumLoads = getEstimatedVLFor(&VTy); 555 return NumLoads * MemOpCost; 556 } 557 558 // Currently, these represent both throughput and codesize costs 559 // for the respective intrinsics. The costs in this table are simply 560 // instruction counts with the following adjustments made: 561 // * One vsetvli is considered free. 
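// As an illustrative reading of the table: the {Intrinsic::floor, MVT::v4f32,
// 9} entry below says that a v4f32 floor lowers to roughly nine instructions
// (its single vsetvli is not counted), and that count is used directly as
// both the throughput and codesize cost.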
562 static const CostTblEntry VectorIntrinsicCostTable[]{ 563 {Intrinsic::floor, MVT::v2f32, 9}, 564 {Intrinsic::floor, MVT::v4f32, 9}, 565 {Intrinsic::floor, MVT::v8f32, 9}, 566 {Intrinsic::floor, MVT::v16f32, 9}, 567 {Intrinsic::floor, MVT::nxv1f32, 9}, 568 {Intrinsic::floor, MVT::nxv2f32, 9}, 569 {Intrinsic::floor, MVT::nxv4f32, 9}, 570 {Intrinsic::floor, MVT::nxv8f32, 9}, 571 {Intrinsic::floor, MVT::nxv16f32, 9}, 572 {Intrinsic::floor, MVT::v2f64, 9}, 573 {Intrinsic::floor, MVT::v4f64, 9}, 574 {Intrinsic::floor, MVT::v8f64, 9}, 575 {Intrinsic::floor, MVT::v16f64, 9}, 576 {Intrinsic::floor, MVT::nxv1f64, 9}, 577 {Intrinsic::floor, MVT::nxv2f64, 9}, 578 {Intrinsic::floor, MVT::nxv4f64, 9}, 579 {Intrinsic::floor, MVT::nxv8f64, 9}, 580 {Intrinsic::ceil, MVT::v2f32, 9}, 581 {Intrinsic::ceil, MVT::v4f32, 9}, 582 {Intrinsic::ceil, MVT::v8f32, 9}, 583 {Intrinsic::ceil, MVT::v16f32, 9}, 584 {Intrinsic::ceil, MVT::nxv1f32, 9}, 585 {Intrinsic::ceil, MVT::nxv2f32, 9}, 586 {Intrinsic::ceil, MVT::nxv4f32, 9}, 587 {Intrinsic::ceil, MVT::nxv8f32, 9}, 588 {Intrinsic::ceil, MVT::nxv16f32, 9}, 589 {Intrinsic::ceil, MVT::v2f64, 9}, 590 {Intrinsic::ceil, MVT::v4f64, 9}, 591 {Intrinsic::ceil, MVT::v8f64, 9}, 592 {Intrinsic::ceil, MVT::v16f64, 9}, 593 {Intrinsic::ceil, MVT::nxv1f64, 9}, 594 {Intrinsic::ceil, MVT::nxv2f64, 9}, 595 {Intrinsic::ceil, MVT::nxv4f64, 9}, 596 {Intrinsic::ceil, MVT::nxv8f64, 9}, 597 {Intrinsic::trunc, MVT::v2f32, 7}, 598 {Intrinsic::trunc, MVT::v4f32, 7}, 599 {Intrinsic::trunc, MVT::v8f32, 7}, 600 {Intrinsic::trunc, MVT::v16f32, 7}, 601 {Intrinsic::trunc, MVT::nxv1f32, 7}, 602 {Intrinsic::trunc, MVT::nxv2f32, 7}, 603 {Intrinsic::trunc, MVT::nxv4f32, 7}, 604 {Intrinsic::trunc, MVT::nxv8f32, 7}, 605 {Intrinsic::trunc, MVT::nxv16f32, 7}, 606 {Intrinsic::trunc, MVT::v2f64, 7}, 607 {Intrinsic::trunc, MVT::v4f64, 7}, 608 {Intrinsic::trunc, MVT::v8f64, 7}, 609 {Intrinsic::trunc, MVT::v16f64, 7}, 610 {Intrinsic::trunc, MVT::nxv1f64, 7}, 611 {Intrinsic::trunc, MVT::nxv2f64, 7}, 612 {Intrinsic::trunc, MVT::nxv4f64, 7}, 613 {Intrinsic::trunc, MVT::nxv8f64, 7}, 614 {Intrinsic::round, MVT::v2f32, 9}, 615 {Intrinsic::round, MVT::v4f32, 9}, 616 {Intrinsic::round, MVT::v8f32, 9}, 617 {Intrinsic::round, MVT::v16f32, 9}, 618 {Intrinsic::round, MVT::nxv1f32, 9}, 619 {Intrinsic::round, MVT::nxv2f32, 9}, 620 {Intrinsic::round, MVT::nxv4f32, 9}, 621 {Intrinsic::round, MVT::nxv8f32, 9}, 622 {Intrinsic::round, MVT::nxv16f32, 9}, 623 {Intrinsic::round, MVT::v2f64, 9}, 624 {Intrinsic::round, MVT::v4f64, 9}, 625 {Intrinsic::round, MVT::v8f64, 9}, 626 {Intrinsic::round, MVT::v16f64, 9}, 627 {Intrinsic::round, MVT::nxv1f64, 9}, 628 {Intrinsic::round, MVT::nxv2f64, 9}, 629 {Intrinsic::round, MVT::nxv4f64, 9}, 630 {Intrinsic::round, MVT::nxv8f64, 9}, 631 {Intrinsic::roundeven, MVT::v2f32, 9}, 632 {Intrinsic::roundeven, MVT::v4f32, 9}, 633 {Intrinsic::roundeven, MVT::v8f32, 9}, 634 {Intrinsic::roundeven, MVT::v16f32, 9}, 635 {Intrinsic::roundeven, MVT::nxv1f32, 9}, 636 {Intrinsic::roundeven, MVT::nxv2f32, 9}, 637 {Intrinsic::roundeven, MVT::nxv4f32, 9}, 638 {Intrinsic::roundeven, MVT::nxv8f32, 9}, 639 {Intrinsic::roundeven, MVT::nxv16f32, 9}, 640 {Intrinsic::roundeven, MVT::v2f64, 9}, 641 {Intrinsic::roundeven, MVT::v4f64, 9}, 642 {Intrinsic::roundeven, MVT::v8f64, 9}, 643 {Intrinsic::roundeven, MVT::v16f64, 9}, 644 {Intrinsic::roundeven, MVT::nxv1f64, 9}, 645 {Intrinsic::roundeven, MVT::nxv2f64, 9}, 646 {Intrinsic::roundeven, MVT::nxv4f64, 9}, 647 {Intrinsic::roundeven, MVT::nxv8f64, 9}, 648 {Intrinsic::rint, 
MVT::v2f32, 7}, 649 {Intrinsic::rint, MVT::v4f32, 7}, 650 {Intrinsic::rint, MVT::v8f32, 7}, 651 {Intrinsic::rint, MVT::v16f32, 7}, 652 {Intrinsic::rint, MVT::nxv1f32, 7}, 653 {Intrinsic::rint, MVT::nxv2f32, 7}, 654 {Intrinsic::rint, MVT::nxv4f32, 7}, 655 {Intrinsic::rint, MVT::nxv8f32, 7}, 656 {Intrinsic::rint, MVT::nxv16f32, 7}, 657 {Intrinsic::rint, MVT::v2f64, 7}, 658 {Intrinsic::rint, MVT::v4f64, 7}, 659 {Intrinsic::rint, MVT::v8f64, 7}, 660 {Intrinsic::rint, MVT::v16f64, 7}, 661 {Intrinsic::rint, MVT::nxv1f64, 7}, 662 {Intrinsic::rint, MVT::nxv2f64, 7}, 663 {Intrinsic::rint, MVT::nxv4f64, 7}, 664 {Intrinsic::rint, MVT::nxv8f64, 7}, 665 {Intrinsic::nearbyint, MVT::v2f32, 9}, 666 {Intrinsic::nearbyint, MVT::v4f32, 9}, 667 {Intrinsic::nearbyint, MVT::v8f32, 9}, 668 {Intrinsic::nearbyint, MVT::v16f32, 9}, 669 {Intrinsic::nearbyint, MVT::nxv1f32, 9}, 670 {Intrinsic::nearbyint, MVT::nxv2f32, 9}, 671 {Intrinsic::nearbyint, MVT::nxv4f32, 9}, 672 {Intrinsic::nearbyint, MVT::nxv8f32, 9}, 673 {Intrinsic::nearbyint, MVT::nxv16f32, 9}, 674 {Intrinsic::nearbyint, MVT::v2f64, 9}, 675 {Intrinsic::nearbyint, MVT::v4f64, 9}, 676 {Intrinsic::nearbyint, MVT::v8f64, 9}, 677 {Intrinsic::nearbyint, MVT::v16f64, 9}, 678 {Intrinsic::nearbyint, MVT::nxv1f64, 9}, 679 {Intrinsic::nearbyint, MVT::nxv2f64, 9}, 680 {Intrinsic::nearbyint, MVT::nxv4f64, 9}, 681 {Intrinsic::nearbyint, MVT::nxv8f64, 9}, 682 {Intrinsic::bswap, MVT::v2i16, 3}, 683 {Intrinsic::bswap, MVT::v4i16, 3}, 684 {Intrinsic::bswap, MVT::v8i16, 3}, 685 {Intrinsic::bswap, MVT::v16i16, 3}, 686 {Intrinsic::bswap, MVT::nxv1i16, 3}, 687 {Intrinsic::bswap, MVT::nxv2i16, 3}, 688 {Intrinsic::bswap, MVT::nxv4i16, 3}, 689 {Intrinsic::bswap, MVT::nxv8i16, 3}, 690 {Intrinsic::bswap, MVT::nxv16i16, 3}, 691 {Intrinsic::bswap, MVT::v2i32, 12}, 692 {Intrinsic::bswap, MVT::v4i32, 12}, 693 {Intrinsic::bswap, MVT::v8i32, 12}, 694 {Intrinsic::bswap, MVT::v16i32, 12}, 695 {Intrinsic::bswap, MVT::nxv1i32, 12}, 696 {Intrinsic::bswap, MVT::nxv2i32, 12}, 697 {Intrinsic::bswap, MVT::nxv4i32, 12}, 698 {Intrinsic::bswap, MVT::nxv8i32, 12}, 699 {Intrinsic::bswap, MVT::nxv16i32, 12}, 700 {Intrinsic::bswap, MVT::v2i64, 31}, 701 {Intrinsic::bswap, MVT::v4i64, 31}, 702 {Intrinsic::bswap, MVT::v8i64, 31}, 703 {Intrinsic::bswap, MVT::v16i64, 31}, 704 {Intrinsic::bswap, MVT::nxv1i64, 31}, 705 {Intrinsic::bswap, MVT::nxv2i64, 31}, 706 {Intrinsic::bswap, MVT::nxv4i64, 31}, 707 {Intrinsic::bswap, MVT::nxv8i64, 31}, 708 {Intrinsic::vp_bswap, MVT::v2i16, 3}, 709 {Intrinsic::vp_bswap, MVT::v4i16, 3}, 710 {Intrinsic::vp_bswap, MVT::v8i16, 3}, 711 {Intrinsic::vp_bswap, MVT::v16i16, 3}, 712 {Intrinsic::vp_bswap, MVT::nxv1i16, 3}, 713 {Intrinsic::vp_bswap, MVT::nxv2i16, 3}, 714 {Intrinsic::vp_bswap, MVT::nxv4i16, 3}, 715 {Intrinsic::vp_bswap, MVT::nxv8i16, 3}, 716 {Intrinsic::vp_bswap, MVT::nxv16i16, 3}, 717 {Intrinsic::vp_bswap, MVT::v2i32, 12}, 718 {Intrinsic::vp_bswap, MVT::v4i32, 12}, 719 {Intrinsic::vp_bswap, MVT::v8i32, 12}, 720 {Intrinsic::vp_bswap, MVT::v16i32, 12}, 721 {Intrinsic::vp_bswap, MVT::nxv1i32, 12}, 722 {Intrinsic::vp_bswap, MVT::nxv2i32, 12}, 723 {Intrinsic::vp_bswap, MVT::nxv4i32, 12}, 724 {Intrinsic::vp_bswap, MVT::nxv8i32, 12}, 725 {Intrinsic::vp_bswap, MVT::nxv16i32, 12}, 726 {Intrinsic::vp_bswap, MVT::v2i64, 31}, 727 {Intrinsic::vp_bswap, MVT::v4i64, 31}, 728 {Intrinsic::vp_bswap, MVT::v8i64, 31}, 729 {Intrinsic::vp_bswap, MVT::v16i64, 31}, 730 {Intrinsic::vp_bswap, MVT::nxv1i64, 31}, 731 {Intrinsic::vp_bswap, MVT::nxv2i64, 31}, 732 {Intrinsic::vp_bswap, MVT::nxv4i64, 
31}, 733 {Intrinsic::vp_bswap, MVT::nxv8i64, 31}, 734 {Intrinsic::vp_fshl, MVT::v2i8, 7}, 735 {Intrinsic::vp_fshl, MVT::v4i8, 7}, 736 {Intrinsic::vp_fshl, MVT::v8i8, 7}, 737 {Intrinsic::vp_fshl, MVT::v16i8, 7}, 738 {Intrinsic::vp_fshl, MVT::nxv1i8, 7}, 739 {Intrinsic::vp_fshl, MVT::nxv2i8, 7}, 740 {Intrinsic::vp_fshl, MVT::nxv4i8, 7}, 741 {Intrinsic::vp_fshl, MVT::nxv8i8, 7}, 742 {Intrinsic::vp_fshl, MVT::nxv16i8, 7}, 743 {Intrinsic::vp_fshl, MVT::nxv32i8, 7}, 744 {Intrinsic::vp_fshl, MVT::nxv64i8, 7}, 745 {Intrinsic::vp_fshl, MVT::v2i16, 7}, 746 {Intrinsic::vp_fshl, MVT::v4i16, 7}, 747 {Intrinsic::vp_fshl, MVT::v8i16, 7}, 748 {Intrinsic::vp_fshl, MVT::v16i16, 7}, 749 {Intrinsic::vp_fshl, MVT::nxv1i16, 7}, 750 {Intrinsic::vp_fshl, MVT::nxv2i16, 7}, 751 {Intrinsic::vp_fshl, MVT::nxv4i16, 7}, 752 {Intrinsic::vp_fshl, MVT::nxv8i16, 7}, 753 {Intrinsic::vp_fshl, MVT::nxv16i16, 7}, 754 {Intrinsic::vp_fshl, MVT::nxv32i16, 7}, 755 {Intrinsic::vp_fshl, MVT::v2i32, 7}, 756 {Intrinsic::vp_fshl, MVT::v4i32, 7}, 757 {Intrinsic::vp_fshl, MVT::v8i32, 7}, 758 {Intrinsic::vp_fshl, MVT::v16i32, 7}, 759 {Intrinsic::vp_fshl, MVT::nxv1i32, 7}, 760 {Intrinsic::vp_fshl, MVT::nxv2i32, 7}, 761 {Intrinsic::vp_fshl, MVT::nxv4i32, 7}, 762 {Intrinsic::vp_fshl, MVT::nxv8i32, 7}, 763 {Intrinsic::vp_fshl, MVT::nxv16i32, 7}, 764 {Intrinsic::vp_fshl, MVT::v2i64, 7}, 765 {Intrinsic::vp_fshl, MVT::v4i64, 7}, 766 {Intrinsic::vp_fshl, MVT::v8i64, 7}, 767 {Intrinsic::vp_fshl, MVT::v16i64, 7}, 768 {Intrinsic::vp_fshl, MVT::nxv1i64, 7}, 769 {Intrinsic::vp_fshl, MVT::nxv2i64, 7}, 770 {Intrinsic::vp_fshl, MVT::nxv4i64, 7}, 771 {Intrinsic::vp_fshl, MVT::nxv8i64, 7}, 772 {Intrinsic::vp_fshr, MVT::v2i8, 7}, 773 {Intrinsic::vp_fshr, MVT::v4i8, 7}, 774 {Intrinsic::vp_fshr, MVT::v8i8, 7}, 775 {Intrinsic::vp_fshr, MVT::v16i8, 7}, 776 {Intrinsic::vp_fshr, MVT::nxv1i8, 7}, 777 {Intrinsic::vp_fshr, MVT::nxv2i8, 7}, 778 {Intrinsic::vp_fshr, MVT::nxv4i8, 7}, 779 {Intrinsic::vp_fshr, MVT::nxv8i8, 7}, 780 {Intrinsic::vp_fshr, MVT::nxv16i8, 7}, 781 {Intrinsic::vp_fshr, MVT::nxv32i8, 7}, 782 {Intrinsic::vp_fshr, MVT::nxv64i8, 7}, 783 {Intrinsic::vp_fshr, MVT::v2i16, 7}, 784 {Intrinsic::vp_fshr, MVT::v4i16, 7}, 785 {Intrinsic::vp_fshr, MVT::v8i16, 7}, 786 {Intrinsic::vp_fshr, MVT::v16i16, 7}, 787 {Intrinsic::vp_fshr, MVT::nxv1i16, 7}, 788 {Intrinsic::vp_fshr, MVT::nxv2i16, 7}, 789 {Intrinsic::vp_fshr, MVT::nxv4i16, 7}, 790 {Intrinsic::vp_fshr, MVT::nxv8i16, 7}, 791 {Intrinsic::vp_fshr, MVT::nxv16i16, 7}, 792 {Intrinsic::vp_fshr, MVT::nxv32i16, 7}, 793 {Intrinsic::vp_fshr, MVT::v2i32, 7}, 794 {Intrinsic::vp_fshr, MVT::v4i32, 7}, 795 {Intrinsic::vp_fshr, MVT::v8i32, 7}, 796 {Intrinsic::vp_fshr, MVT::v16i32, 7}, 797 {Intrinsic::vp_fshr, MVT::nxv1i32, 7}, 798 {Intrinsic::vp_fshr, MVT::nxv2i32, 7}, 799 {Intrinsic::vp_fshr, MVT::nxv4i32, 7}, 800 {Intrinsic::vp_fshr, MVT::nxv8i32, 7}, 801 {Intrinsic::vp_fshr, MVT::nxv16i32, 7}, 802 {Intrinsic::vp_fshr, MVT::v2i64, 7}, 803 {Intrinsic::vp_fshr, MVT::v4i64, 7}, 804 {Intrinsic::vp_fshr, MVT::v8i64, 7}, 805 {Intrinsic::vp_fshr, MVT::v16i64, 7}, 806 {Intrinsic::vp_fshr, MVT::nxv1i64, 7}, 807 {Intrinsic::vp_fshr, MVT::nxv2i64, 7}, 808 {Intrinsic::vp_fshr, MVT::nxv4i64, 7}, 809 {Intrinsic::vp_fshr, MVT::nxv8i64, 7}, 810 {Intrinsic::bitreverse, MVT::v2i8, 17}, 811 {Intrinsic::bitreverse, MVT::v4i8, 17}, 812 {Intrinsic::bitreverse, MVT::v8i8, 17}, 813 {Intrinsic::bitreverse, MVT::v16i8, 17}, 814 {Intrinsic::bitreverse, MVT::nxv1i8, 17}, 815 {Intrinsic::bitreverse, MVT::nxv2i8, 17}, 816 {Intrinsic::bitreverse, 
MVT::nxv4i8, 17}, 817 {Intrinsic::bitreverse, MVT::nxv8i8, 17}, 818 {Intrinsic::bitreverse, MVT::nxv16i8, 17}, 819 {Intrinsic::bitreverse, MVT::v2i16, 24}, 820 {Intrinsic::bitreverse, MVT::v4i16, 24}, 821 {Intrinsic::bitreverse, MVT::v8i16, 24}, 822 {Intrinsic::bitreverse, MVT::v16i16, 24}, 823 {Intrinsic::bitreverse, MVT::nxv1i16, 24}, 824 {Intrinsic::bitreverse, MVT::nxv2i16, 24}, 825 {Intrinsic::bitreverse, MVT::nxv4i16, 24}, 826 {Intrinsic::bitreverse, MVT::nxv8i16, 24}, 827 {Intrinsic::bitreverse, MVT::nxv16i16, 24}, 828 {Intrinsic::bitreverse, MVT::v2i32, 33}, 829 {Intrinsic::bitreverse, MVT::v4i32, 33}, 830 {Intrinsic::bitreverse, MVT::v8i32, 33}, 831 {Intrinsic::bitreverse, MVT::v16i32, 33}, 832 {Intrinsic::bitreverse, MVT::nxv1i32, 33}, 833 {Intrinsic::bitreverse, MVT::nxv2i32, 33}, 834 {Intrinsic::bitreverse, MVT::nxv4i32, 33}, 835 {Intrinsic::bitreverse, MVT::nxv8i32, 33}, 836 {Intrinsic::bitreverse, MVT::nxv16i32, 33}, 837 {Intrinsic::bitreverse, MVT::v2i64, 52}, 838 {Intrinsic::bitreverse, MVT::v4i64, 52}, 839 {Intrinsic::bitreverse, MVT::v8i64, 52}, 840 {Intrinsic::bitreverse, MVT::v16i64, 52}, 841 {Intrinsic::bitreverse, MVT::nxv1i64, 52}, 842 {Intrinsic::bitreverse, MVT::nxv2i64, 52}, 843 {Intrinsic::bitreverse, MVT::nxv4i64, 52}, 844 {Intrinsic::bitreverse, MVT::nxv8i64, 52}, 845 {Intrinsic::vp_bitreverse, MVT::v2i8, 17}, 846 {Intrinsic::vp_bitreverse, MVT::v4i8, 17}, 847 {Intrinsic::vp_bitreverse, MVT::v8i8, 17}, 848 {Intrinsic::vp_bitreverse, MVT::v16i8, 17}, 849 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17}, 850 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17}, 851 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17}, 852 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17}, 853 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17}, 854 {Intrinsic::vp_bitreverse, MVT::v2i16, 24}, 855 {Intrinsic::vp_bitreverse, MVT::v4i16, 24}, 856 {Intrinsic::vp_bitreverse, MVT::v8i16, 24}, 857 {Intrinsic::vp_bitreverse, MVT::v16i16, 24}, 858 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24}, 859 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24}, 860 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24}, 861 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24}, 862 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24}, 863 {Intrinsic::vp_bitreverse, MVT::v2i32, 33}, 864 {Intrinsic::vp_bitreverse, MVT::v4i32, 33}, 865 {Intrinsic::vp_bitreverse, MVT::v8i32, 33}, 866 {Intrinsic::vp_bitreverse, MVT::v16i32, 33}, 867 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33}, 868 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33}, 869 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33}, 870 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33}, 871 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33}, 872 {Intrinsic::vp_bitreverse, MVT::v2i64, 52}, 873 {Intrinsic::vp_bitreverse, MVT::v4i64, 52}, 874 {Intrinsic::vp_bitreverse, MVT::v8i64, 52}, 875 {Intrinsic::vp_bitreverse, MVT::v16i64, 52}, 876 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52}, 877 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52}, 878 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52}, 879 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52}, 880 {Intrinsic::ctpop, MVT::v2i8, 12}, 881 {Intrinsic::ctpop, MVT::v4i8, 12}, 882 {Intrinsic::ctpop, MVT::v8i8, 12}, 883 {Intrinsic::ctpop, MVT::v16i8, 12}, 884 {Intrinsic::ctpop, MVT::nxv1i8, 12}, 885 {Intrinsic::ctpop, MVT::nxv2i8, 12}, 886 {Intrinsic::ctpop, MVT::nxv4i8, 12}, 887 {Intrinsic::ctpop, MVT::nxv8i8, 12}, 888 {Intrinsic::ctpop, MVT::nxv16i8, 12}, 889 {Intrinsic::ctpop, MVT::v2i16, 19}, 890 {Intrinsic::ctpop, MVT::v4i16, 19}, 891 {Intrinsic::ctpop, MVT::v8i16, 19}, 892 {Intrinsic::ctpop, MVT::v16i16, 19}, 
893 {Intrinsic::ctpop, MVT::nxv1i16, 19}, 894 {Intrinsic::ctpop, MVT::nxv2i16, 19}, 895 {Intrinsic::ctpop, MVT::nxv4i16, 19}, 896 {Intrinsic::ctpop, MVT::nxv8i16, 19}, 897 {Intrinsic::ctpop, MVT::nxv16i16, 19}, 898 {Intrinsic::ctpop, MVT::v2i32, 20}, 899 {Intrinsic::ctpop, MVT::v4i32, 20}, 900 {Intrinsic::ctpop, MVT::v8i32, 20}, 901 {Intrinsic::ctpop, MVT::v16i32, 20}, 902 {Intrinsic::ctpop, MVT::nxv1i32, 20}, 903 {Intrinsic::ctpop, MVT::nxv2i32, 20}, 904 {Intrinsic::ctpop, MVT::nxv4i32, 20}, 905 {Intrinsic::ctpop, MVT::nxv8i32, 20}, 906 {Intrinsic::ctpop, MVT::nxv16i32, 20}, 907 {Intrinsic::ctpop, MVT::v2i64, 21}, 908 {Intrinsic::ctpop, MVT::v4i64, 21}, 909 {Intrinsic::ctpop, MVT::v8i64, 21}, 910 {Intrinsic::ctpop, MVT::v16i64, 21}, 911 {Intrinsic::ctpop, MVT::nxv1i64, 21}, 912 {Intrinsic::ctpop, MVT::nxv2i64, 21}, 913 {Intrinsic::ctpop, MVT::nxv4i64, 21}, 914 {Intrinsic::ctpop, MVT::nxv8i64, 21}, 915 {Intrinsic::vp_ctpop, MVT::v2i8, 12}, 916 {Intrinsic::vp_ctpop, MVT::v4i8, 12}, 917 {Intrinsic::vp_ctpop, MVT::v8i8, 12}, 918 {Intrinsic::vp_ctpop, MVT::v16i8, 12}, 919 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12}, 920 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12}, 921 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12}, 922 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12}, 923 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12}, 924 {Intrinsic::vp_ctpop, MVT::v2i16, 19}, 925 {Intrinsic::vp_ctpop, MVT::v4i16, 19}, 926 {Intrinsic::vp_ctpop, MVT::v8i16, 19}, 927 {Intrinsic::vp_ctpop, MVT::v16i16, 19}, 928 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19}, 929 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19}, 930 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19}, 931 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19}, 932 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19}, 933 {Intrinsic::vp_ctpop, MVT::v2i32, 20}, 934 {Intrinsic::vp_ctpop, MVT::v4i32, 20}, 935 {Intrinsic::vp_ctpop, MVT::v8i32, 20}, 936 {Intrinsic::vp_ctpop, MVT::v16i32, 20}, 937 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20}, 938 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20}, 939 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20}, 940 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20}, 941 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20}, 942 {Intrinsic::vp_ctpop, MVT::v2i64, 21}, 943 {Intrinsic::vp_ctpop, MVT::v4i64, 21}, 944 {Intrinsic::vp_ctpop, MVT::v8i64, 21}, 945 {Intrinsic::vp_ctpop, MVT::v16i64, 21}, 946 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21}, 947 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21}, 948 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21}, 949 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21}, 950 {Intrinsic::vp_ctlz, MVT::v2i8, 19}, 951 {Intrinsic::vp_ctlz, MVT::v4i8, 19}, 952 {Intrinsic::vp_ctlz, MVT::v8i8, 19}, 953 {Intrinsic::vp_ctlz, MVT::v16i8, 19}, 954 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19}, 955 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19}, 956 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19}, 957 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19}, 958 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19}, 959 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19}, 960 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19}, 961 {Intrinsic::vp_ctlz, MVT::v2i16, 28}, 962 {Intrinsic::vp_ctlz, MVT::v4i16, 28}, 963 {Intrinsic::vp_ctlz, MVT::v8i16, 28}, 964 {Intrinsic::vp_ctlz, MVT::v16i16, 28}, 965 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28}, 966 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28}, 967 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28}, 968 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28}, 969 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28}, 970 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28}, 971 {Intrinsic::vp_ctlz, MVT::v2i32, 31}, 972 {Intrinsic::vp_ctlz, MVT::v4i32, 31}, 973 {Intrinsic::vp_ctlz, MVT::v8i32, 31}, 974 {Intrinsic::vp_ctlz, MVT::v16i32, 31}, 975 {Intrinsic::vp_ctlz, 
MVT::nxv1i32, 31}, 976 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31}, 977 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31}, 978 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31}, 979 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31}, 980 {Intrinsic::vp_ctlz, MVT::v2i64, 35}, 981 {Intrinsic::vp_ctlz, MVT::v4i64, 35}, 982 {Intrinsic::vp_ctlz, MVT::v8i64, 35}, 983 {Intrinsic::vp_ctlz, MVT::v16i64, 35}, 984 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35}, 985 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35}, 986 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35}, 987 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35}, 988 {Intrinsic::vp_cttz, MVT::v2i8, 16}, 989 {Intrinsic::vp_cttz, MVT::v4i8, 16}, 990 {Intrinsic::vp_cttz, MVT::v8i8, 16}, 991 {Intrinsic::vp_cttz, MVT::v16i8, 16}, 992 {Intrinsic::vp_cttz, MVT::nxv1i8, 16}, 993 {Intrinsic::vp_cttz, MVT::nxv2i8, 16}, 994 {Intrinsic::vp_cttz, MVT::nxv4i8, 16}, 995 {Intrinsic::vp_cttz, MVT::nxv8i8, 16}, 996 {Intrinsic::vp_cttz, MVT::nxv16i8, 16}, 997 {Intrinsic::vp_cttz, MVT::nxv32i8, 16}, 998 {Intrinsic::vp_cttz, MVT::nxv64i8, 16}, 999 {Intrinsic::vp_cttz, MVT::v2i16, 23}, 1000 {Intrinsic::vp_cttz, MVT::v4i16, 23}, 1001 {Intrinsic::vp_cttz, MVT::v8i16, 23}, 1002 {Intrinsic::vp_cttz, MVT::v16i16, 23}, 1003 {Intrinsic::vp_cttz, MVT::nxv1i16, 23}, 1004 {Intrinsic::vp_cttz, MVT::nxv2i16, 23}, 1005 {Intrinsic::vp_cttz, MVT::nxv4i16, 23}, 1006 {Intrinsic::vp_cttz, MVT::nxv8i16, 23}, 1007 {Intrinsic::vp_cttz, MVT::nxv16i16, 23}, 1008 {Intrinsic::vp_cttz, MVT::nxv32i16, 23}, 1009 {Intrinsic::vp_cttz, MVT::v2i32, 24}, 1010 {Intrinsic::vp_cttz, MVT::v4i32, 24}, 1011 {Intrinsic::vp_cttz, MVT::v8i32, 24}, 1012 {Intrinsic::vp_cttz, MVT::v16i32, 24}, 1013 {Intrinsic::vp_cttz, MVT::nxv1i32, 24}, 1014 {Intrinsic::vp_cttz, MVT::nxv2i32, 24}, 1015 {Intrinsic::vp_cttz, MVT::nxv4i32, 24}, 1016 {Intrinsic::vp_cttz, MVT::nxv8i32, 24}, 1017 {Intrinsic::vp_cttz, MVT::nxv16i32, 24}, 1018 {Intrinsic::vp_cttz, MVT::v2i64, 25}, 1019 {Intrinsic::vp_cttz, MVT::v4i64, 25}, 1020 {Intrinsic::vp_cttz, MVT::v8i64, 25}, 1021 {Intrinsic::vp_cttz, MVT::v16i64, 25}, 1022 {Intrinsic::vp_cttz, MVT::nxv1i64, 25}, 1023 {Intrinsic::vp_cttz, MVT::nxv2i64, 25}, 1024 {Intrinsic::vp_cttz, MVT::nxv4i64, 25}, 1025 {Intrinsic::vp_cttz, MVT::nxv8i64, 25}, 1026 }; 1027 1028 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) { 1029 switch (ID) { 1030 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \ 1031 case Intrinsic::VPID: \ 1032 return ISD::VPSD; 1033 #include "llvm/IR/VPIntrinsics.def" 1034 #undef HELPER_MAP_VPID_TO_VPSD 1035 } 1036 return ISD::DELETED_NODE; 1037 } 1038 1039 InstructionCost 1040 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1041 TTI::TargetCostKind CostKind) { 1042 auto *RetTy = ICA.getReturnType(); 1043 switch (ICA.getID()) { 1044 case Intrinsic::ceil: 1045 case Intrinsic::floor: 1046 case Intrinsic::trunc: 1047 case Intrinsic::rint: 1048 case Intrinsic::round: 1049 case Intrinsic::roundeven: { 1050 // These all use the same code. 
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    auto LT = getTypeLegalizationCost(RetTy);
    if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                            ICA.getID(), LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELEN() ||
        Dst->getScalarSizeInBits() > ST->getELEN())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
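    // Worked example (illustrative): an fpext from nxv2f16 to nxv2f64 has
    // PowDiff = log2(64) - log2(16) = 2 and is costed below as two widening
    // conversions (f16 -> f32 -> f64); likewise a truncate from i32 elements
    // to i8 elements is costed as two narrowing steps.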
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from a mask vector.
        // Instead we use the following instructions to extend from a mask
        // vector:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt instructions to truncate to a mask
        // vector, so we cannot use PowDiff to calculate it.
        // Instead we use the following instructions to truncate to a mask
        // vector:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of converting from or to a mask vector is different from
        // the other cases, so we cannot use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough info here so we slightly overcost.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv and one RVV reduction instruction.
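  // Worked example (illustrative): a umax reduction over a fixed-length
  // v8i32 that legalizes to a single register group (LT.first == 1) has an
  // estimated VL of 8, so its throughput cost below is 2 + ceil(log2(8)) = 5.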
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv and one RVV reduction instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non-constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory ops cost scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector())
    BaseCost *= getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Supported natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
    // Supported natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector())
    return 0;

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement, we also need an addi to compute Index + 1 as the VL
  // for the vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We could extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting/inserting an element of a mask vector differs from the normal
  // case.
  if (Val->getScalarSizeInBits() == 1) {
    // For extractelement, we need the following instructions:
    // vmv.v.i v8, 0
    // vmerge.vim v8, v8, 1, v0
    // vsetivli zero, 1, e8, m2, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8

    // For insertelement, we need the following instructions:
    // vsetvli a2, zero, e8, m1, ta, mu (not counted)
    // vmv.s.x v8, a0
    // vmv.v.i v9, 0
    // vmerge.vim v9, v9, 1, v0
    // addi a0, a1, 1
    // vsetvli zero, a0, e8, m1, tu, mu (not counted)
    // vslideup.vx v9, v8, a1
    // vsetvli a0, zero, e8, m1, ta, mu (not counted)
    // vand.vi v8, v9, 1
    // vmsne.vi v0, v8, 0

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
  }
  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: Should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar types.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
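// Informal example of the model below (illustrative, not taken from the code):
// for a unit-stride chain of loads from p, p+1, p+2, ... that share the same
// base, each non-base GEP whose byte offset (Stride * i) fits a legal
// addressing mode (on RISC-V, a 12-bit signed immediate) is treated as free,
// since the offset can be folded into the load/store; other GEPs with a
// non-constant index cost one ADD each.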
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we only take GEP instructions into account (although
  // an alloca instruction, a plain value, constants and/or constant
  // expressions, PHIs, bitcasts ... anything allowed to be used as a pointer
  // may also appear here). Typically, if Base is not a GEP instruction and all
  // the pointers are relative to the same base address, the rest are either
  // GEP instructions, PHIs, bitcasts or constants. When the base is the same,
  // we just calculate the cost of each non-Base GEP as an ADD operation if any
  // of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as the sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  // would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, independent of the conditions
  // below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early exit as it
  // mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
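  // (For example: the header with the compare-and-branch, the "then" block,
  // the "else" block, and the latch where they merge.)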
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops containing calls, as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in the LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or the element width is absurd, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give the instruction count first
  // priority in the comparison.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
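// Worked example of the lexicographic ordering above (values made up for
// illustration): if C1 = {Insns = 2, NumRegs = 9, ...} and
// C2 = {Insns = 3, NumRegs = 1, ...}, then C1 is preferred because Insns is
// compared first and 2 < 3; NumRegs and the remaining fields only break ties
// when every earlier field compares equal.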