1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "RISCVTargetTransformInfo.h" 10 #include "MCTargetDesc/RISCVMatInt.h" 11 #include "llvm/ADT/STLExtras.h" 12 #include "llvm/Analysis/TargetTransformInfo.h" 13 #include "llvm/CodeGen/BasicTTIImpl.h" 14 #include "llvm/CodeGen/CostTable.h" 15 #include "llvm/CodeGen/TargetLowering.h" 16 #include "llvm/IR/Instructions.h" 17 #include <cmath> 18 #include <optional> 19 using namespace llvm; 20 21 #define DEBUG_TYPE "riscvtti" 22 23 static cl::opt<unsigned> RVVRegisterWidthLMUL( 24 "riscv-v-register-bit-width-lmul", 25 cl::desc( 26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " 27 "by autovectorized code. Fractional LMULs are not supported."), 28 cl::init(2), cl::Hidden); 29 30 static cl::opt<unsigned> SLPMaxVF( 31 "riscv-v-slp-max-vf", 32 cl::desc( 33 "Overrides result used for getMaximumVF query which is used " 34 "exclusively by SLP vectorizer."), 35 cl::Hidden); 36 37 InstructionCost 38 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, 39 TTI::TargetCostKind CostKind) { 40 size_t NumInstr = OpCodes.size(); 41 if (CostKind == TTI::TCK_CodeSize) 42 return NumInstr; 43 InstructionCost LMULCost = TLI->getLMULCost(VT); 44 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) 45 return LMULCost * NumInstr; 46 InstructionCost Cost = 0; 47 for (auto Op : OpCodes) { 48 switch (Op) { 49 case RISCV::VRGATHER_VI: 50 Cost += TLI->getVRGatherVICost(VT); 51 break; 52 case RISCV::VRGATHER_VV: 53 Cost += TLI->getVRGatherVVCost(VT); 54 break; 55 case RISCV::VSLIDEUP_VI: 56 case RISCV::VSLIDEDOWN_VI: 57 Cost += TLI->getVSlideVICost(VT); 58 break; 59 case RISCV::VSLIDEUP_VX: 60 case RISCV::VSLIDEDOWN_VX: 61 Cost += TLI->getVSlideVXCost(VT); 62 break; 63 case RISCV::VREDMAX_VS: 64 case RISCV::VREDMIN_VS: 65 case RISCV::VREDMAXU_VS: 66 case RISCV::VREDMINU_VS: 67 case RISCV::VREDSUM_VS: 68 case RISCV::VREDAND_VS: 69 case RISCV::VREDOR_VS: 70 case RISCV::VREDXOR_VS: 71 case RISCV::VFREDMAX_VS: 72 case RISCV::VFREDMIN_VS: 73 case RISCV::VFREDUSUM_VS: { 74 unsigned VL = VT.getVectorMinNumElements(); 75 if (!VT.isFixedLengthVector()) 76 VL *= *getVScaleForTuning(); 77 Cost += Log2_32_Ceil(VL); 78 break; 79 } 80 case RISCV::VFREDOSUM_VS: { 81 unsigned VL = VT.getVectorMinNumElements(); 82 if (!VT.isFixedLengthVector()) 83 VL *= *getVScaleForTuning(); 84 Cost += VL; 85 break; 86 } 87 case RISCV::VMV_S_X: 88 // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1 89 default: 90 Cost += LMULCost; 91 } 92 } 93 return Cost; 94 } 95 96 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 97 TTI::TargetCostKind CostKind) { 98 assert(Ty->isIntegerTy() && 99 "getIntImmCost can only estimate cost of materialising integers"); 100 101 // We have a Zero register, so 0 is always free. 102 if (Imm == 0) 103 return TTI::TCC_Free; 104 105 // Otherwise, we check how many instructions it will take to materialise. 106 const DataLayout &DL = getDataLayout(); 107 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST()); 108 } 109 110 // Look for patterns of shift followed by AND that can be turned into a pair of 111 // shifts. 
We won't need to materialize an immediate for the AND so these can 112 // be considered free. 113 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { 114 uint64_t Mask = Imm.getZExtValue(); 115 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0)); 116 if (!BO || !BO->hasOneUse()) 117 return false; 118 119 if (BO->getOpcode() != Instruction::Shl) 120 return false; 121 122 if (!isa<ConstantInt>(BO->getOperand(1))) 123 return false; 124 125 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue(); 126 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1 127 // is a mask shifted by c2 bits with c3 leading zeros. 128 if (isShiftedMask_64(Mask)) { 129 unsigned Trailing = llvm::countr_zero(Mask); 130 if (ShAmt == Trailing) 131 return true; 132 } 133 134 return false; 135 } 136 137 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 138 const APInt &Imm, Type *Ty, 139 TTI::TargetCostKind CostKind, 140 Instruction *Inst) { 141 assert(Ty->isIntegerTy() && 142 "getIntImmCost can only estimate cost of materialising integers"); 143 144 // We have a Zero register, so 0 is always free. 145 if (Imm == 0) 146 return TTI::TCC_Free; 147 148 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are 149 // commutative, in others the immediate comes from a specific argument index. 150 bool Takes12BitImm = false; 151 unsigned ImmArgIdx = ~0U; 152 153 switch (Opcode) { 154 case Instruction::GetElementPtr: 155 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will 156 // split up large offsets in GEP into better parts than ConstantHoisting 157 // can. 158 return TTI::TCC_Free; 159 case Instruction::And: 160 // zext.h 161 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb()) 162 return TTI::TCC_Free; 163 // zext.w 164 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba()) 165 return TTI::TCC_Free; 166 // bclri 167 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2()) 168 return TTI::TCC_Free; 169 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && 170 canUseShiftPair(Inst, Imm)) 171 return TTI::TCC_Free; 172 Takes12BitImm = true; 173 break; 174 case Instruction::Add: 175 Takes12BitImm = true; 176 break; 177 case Instruction::Or: 178 case Instruction::Xor: 179 // bseti/binvi 180 if (ST->hasStdExtZbs() && Imm.isPowerOf2()) 181 return TTI::TCC_Free; 182 Takes12BitImm = true; 183 break; 184 case Instruction::Mul: 185 // Power of 2 is a shift. Negated power of 2 is a shift and a negate. 186 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2()) 187 return TTI::TCC_Free; 188 // One more or less than a power of 2 can use SLLI+ADD/SUB. 189 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2()) 190 return TTI::TCC_Free; 191 // FIXME: There is no MULI instruction. 192 Takes12BitImm = true; 193 break; 194 case Instruction::Sub: 195 case Instruction::Shl: 196 case Instruction::LShr: 197 case Instruction::AShr: 198 Takes12BitImm = true; 199 ImmArgIdx = 1; 200 break; 201 default: 202 break; 203 } 204 205 if (Takes12BitImm) { 206 // Check immediate is the correct argument... 207 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) { 208 // ... and fits into the 12-bit immediate. 209 if (Imm.getSignificantBits() <= 64 && 210 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) { 211 return TTI::TCC_Free; 212 } 213 } 214 215 // Otherwise, use the full materialisation cost. 216 return getIntImmCost(Imm, Ty, CostKind); 217 } 218 219 // By default, prevent hoisting. 
220 return TTI::TCC_Free; 221 } 222 223 InstructionCost 224 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 225 const APInt &Imm, Type *Ty, 226 TTI::TargetCostKind CostKind) { 227 // Prevent hoisting in unknown cases. 228 return TTI::TCC_Free; 229 } 230 231 TargetTransformInfo::PopcntSupportKind 232 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) { 233 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 234 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip() 235 ? TTI::PSK_FastHardware 236 : TTI::PSK_Software; 237 } 238 239 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { 240 // Currently, the ExpandReductions pass can't expand scalable-vector 241 // reductions, but we still request expansion as RVV doesn't support certain 242 // reductions and the SelectionDAG can't legalize them either. 243 switch (II->getIntrinsicID()) { 244 default: 245 return false; 246 // These reductions have no equivalent in RVV 247 case Intrinsic::vector_reduce_mul: 248 case Intrinsic::vector_reduce_fmul: 249 return true; 250 } 251 } 252 253 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const { 254 if (ST->hasVInstructions()) 255 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock; 256 return BaseT::getMaxVScale(); 257 } 258 259 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const { 260 if (ST->hasVInstructions()) 261 if (unsigned MinVLen = ST->getRealMinVLen(); 262 MinVLen >= RISCV::RVVBitsPerBlock) 263 return MinVLen / RISCV::RVVBitsPerBlock; 264 return BaseT::getVScaleForTuning(); 265 } 266 267 TypeSize 268 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 269 unsigned LMUL = 270 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8)); 271 switch (K) { 272 case TargetTransformInfo::RGK_Scalar: 273 return TypeSize::getFixed(ST->getXLen()); 274 case TargetTransformInfo::RGK_FixedWidthVector: 275 return TypeSize::getFixed( 276 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0); 277 case TargetTransformInfo::RGK_ScalableVector: 278 return TypeSize::getScalable( 279 (ST->hasVInstructions() && 280 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock) 281 ? LMUL * RISCV::RVVBitsPerBlock 282 : 0); 283 } 284 285 llvm_unreachable("Unsupported register kind"); 286 } 287 288 InstructionCost 289 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) { 290 // Add a cost of address generation + the cost of the load. The address 291 // is expected to be a PC relative offset to a constant pool entry 292 // using auipc/addi. 
293 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty), 294 /*AddressSpace=*/0, CostKind); 295 } 296 297 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, 298 LLVMContext &C) { 299 assert((DataVT.getScalarSizeInBits() != 8 || 300 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering"); 301 MVT IndexVT = DataVT.changeTypeToInteger(); 302 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT())) 303 IndexVT = IndexVT.changeVectorElementType(MVT::i16); 304 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C)); 305 } 306 307 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 308 VectorType *Tp, ArrayRef<int> Mask, 309 TTI::TargetCostKind CostKind, 310 int Index, VectorType *SubTp, 311 ArrayRef<const Value *> Args) { 312 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); 313 314 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 315 316 // First, handle cases where having a fixed length vector enables us to 317 // give a more accurate cost than falling back to generic scalable codegen. 318 // TODO: Each of these cases hints at a modeling gap around scalable vectors. 319 if (isa<FixedVectorType>(Tp)) { 320 switch (Kind) { 321 default: 322 break; 323 case TTI::SK_PermuteSingleSrc: { 324 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) { 325 MVT EltTp = LT.second.getVectorElementType(); 326 // If the size of the element is < ELEN then shuffles of interleaves and 327 // deinterleaves of 2 vectors can be lowered into the following 328 // sequences 329 if (EltTp.getScalarSizeInBits() < ST->getELen()) { 330 // Example sequence: 331 // vsetivli zero, 4, e8, mf4, ta, ma (ignored) 332 // vwaddu.vv v10, v8, v9 333 // li a0, -1 (ignored) 334 // vwmaccu.vx v10, a0, v9 335 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size())) 336 return 2 * LT.first * TLI->getLMULCost(LT.second); 337 338 if (Mask[0] == 0 || Mask[0] == 1) { 339 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size()); 340 // Example sequence: 341 // vnsrl.wi v10, v8, 0 342 if (equal(DeinterleaveMask, Mask)) 343 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI, 344 LT.second, CostKind); 345 } 346 } 347 } 348 // vrgather + cost of generating the mask constant. 349 // We model this for an unknown mask with a single vrgather. 350 if (LT.second.isFixedLengthVector() && LT.first == 1 && 351 (LT.second.getScalarSizeInBits() != 8 || 352 LT.second.getVectorNumElements() <= 256)) { 353 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext()); 354 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); 355 return IndexCost + 356 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); 357 } 358 [[fallthrough]]; 359 } 360 case TTI::SK_Transpose: 361 case TTI::SK_PermuteTwoSrc: { 362 // 2 x (vrgather + cost of generating the mask constant) + cost of mask 363 // register for the second vrgather. We model this for an unknown 364 // (shuffle) mask. 
365 if (LT.second.isFixedLengthVector() && LT.first == 1 && 366 (LT.second.getScalarSizeInBits() != 8 || 367 LT.second.getVectorNumElements() <= 256)) { 368 auto &C = Tp->getContext(); 369 auto EC = Tp->getElementCount(); 370 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C); 371 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC); 372 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); 373 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); 374 return 2 * IndexCost + 375 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, 376 LT.second, CostKind) + 377 MaskCost; 378 } 379 [[fallthrough]]; 380 } 381 case TTI::SK_Select: { 382 // We are going to permute multiple sources and the result will be in 383 // multiple destinations. Providing an accurate cost only for splits where 384 // the element type remains the same. 385 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 && 386 LT.second.isFixedLengthVector() && 387 LT.second.getVectorElementType().getSizeInBits() == 388 Tp->getElementType()->getPrimitiveSizeInBits() && 389 LT.second.getVectorNumElements() < 390 cast<FixedVectorType>(Tp)->getNumElements() && 391 divideCeil(Mask.size(), 392 cast<FixedVectorType>(Tp)->getNumElements()) == 393 static_cast<unsigned>(*LT.first.getValue())) { 394 unsigned NumRegs = *LT.first.getValue(); 395 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements(); 396 unsigned SubVF = PowerOf2Ceil(VF / NumRegs); 397 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF); 398 399 InstructionCost Cost = 0; 400 for (unsigned I = 0; I < NumRegs; ++I) { 401 bool IsSingleVector = true; 402 SmallVector<int> SubMask(SubVF, PoisonMaskElem); 403 transform(Mask.slice(I * SubVF, 404 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF), 405 SubMask.begin(), [&](int I) { 406 bool SingleSubVector = I / VF == 0; 407 IsSingleVector &= SingleSubVector; 408 return (SingleSubVector ? 0 : 1) * SubVF + I % VF; 409 }); 410 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc 411 : TTI::SK_PermuteTwoSrc, 412 SubVecTy, SubMask, CostKind, 0, nullptr); 413 return Cost; 414 } 415 } 416 break; 417 } 418 } 419 }; 420 421 // Handle scalable vectors (and fixed vectors legalized to scalable vectors). 422 switch (Kind) { 423 default: 424 // Fallthrough to generic handling. 425 // TODO: Most of these cases will return getInvalid in generic code, and 426 // must be implemented here. 
427 break; 428 case TTI::SK_ExtractSubvector: 429 // Example sequence: 430 // vsetivli zero, 4, e8, mf2, tu, ma (ignored) 431 // vslidedown.vi v8, v9, 2 432 return LT.first * 433 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind); 434 case TTI::SK_InsertSubvector: 435 // Example sequence: 436 // vsetivli zero, 4, e8, mf2, tu, ma (ignored) 437 // vslideup.vi v8, v9, 2 438 return LT.first * 439 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind); 440 case TTI::SK_Select: { 441 // Example sequence: 442 // li a0, 90 443 // vsetivli zero, 8, e8, mf2, ta, ma (ignored) 444 // vmv.s.x v0, a0 445 // vmerge.vvm v8, v9, v8, v0 446 return LT.first * 447 (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li 448 getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, 449 LT.second, CostKind)); 450 } 451 case TTI::SK_Broadcast: { 452 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == 453 Instruction::InsertElement); 454 if (LT.second.getScalarSizeInBits() == 1) { 455 if (HasScalar) { 456 // Example sequence: 457 // andi a0, a0, 1 458 // vsetivli zero, 2, e8, mf8, ta, ma (ignored) 459 // vmv.v.x v8, a0 460 // vmsne.vi v0, v8, 0 461 return LT.first * 462 (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi 463 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, 464 LT.second, CostKind)); 465 } 466 // Example sequence: 467 // vsetivli zero, 2, e8, mf8, ta, mu (ignored) 468 // vmv.v.i v8, 0 469 // vmerge.vim v8, v8, 1, v0 470 // vmv.x.s a0, v8 471 // andi a0, a0, 1 472 // vmv.v.x v8, a0 473 // vmsne.vi v0, v8, 0 474 475 return LT.first * 476 (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi 477 TLI->getLMULCost( 478 LT.second) + // FIXME: vmv.x.s is the same as extractelement 479 getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, 480 RISCV::VMV_V_X, RISCV::VMSNE_VI}, 481 LT.second, CostKind)); 482 } 483 484 if (HasScalar) { 485 // Example sequence: 486 // vmv.v.x v8, a0 487 return LT.first * 488 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind); 489 } 490 491 // Example sequence: 492 // vrgather.vi v9, v8, 0 493 return LT.first * 494 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind); 495 } 496 case TTI::SK_Splice: { 497 // vslidedown+vslideup. 498 // TODO: Multiplying by LT.first implies this legalizes into multiple copies 499 // of similar code, but I think we expand through memory. 500 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; 501 if (Index >= 0 && Index < 32) 502 Opcodes[0] = RISCV::VSLIDEDOWN_VI; 503 else if (Index < 0 && Index > -32) 504 Opcodes[1] = RISCV::VSLIDEUP_VI; 505 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); 506 } 507 case TTI::SK_Reverse: { 508 // TODO: Cases to improve here: 509 // * Illegal vector types 510 // * i64 on RV32 511 // * i1 vector 512 // At low LMUL, most of the cost is producing the vrgather index register. 513 // At high LMUL, the cost of the vrgather itself will dominate. 514 // Example sequence: 515 // csrr a0, vlenb 516 // srli a0, a0, 3 517 // addi a0, a0, -1 518 // vsetvli a1, zero, e8, mf8, ta, mu (ignored) 519 // vid.v v9 520 // vrsub.vx v10, v9, a0 521 // vrgather.vv v9, v8, v10 522 InstructionCost LenCost = 3; 523 if (LT.second.isFixedLengthVector()) 524 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices 525 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 
0 : 1; 526 // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} 527 InstructionCost GatherCost = 528 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); 529 // Mask operation additionally required extend and truncate 530 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0; 531 return LT.first * (LenCost + GatherCost + ExtendCost); 532 } 533 } 534 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); 535 } 536 537 InstructionCost 538 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, 539 unsigned AddressSpace, 540 TTI::TargetCostKind CostKind) { 541 if (!isLegalMaskedLoadStore(Src, Alignment) || 542 CostKind != TTI::TCK_RecipThroughput) 543 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 544 CostKind); 545 546 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); 547 } 548 549 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( 550 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 551 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 552 bool UseMaskForCond, bool UseMaskForGaps) { 553 if (isa<ScalableVectorType>(VecTy)) 554 return InstructionCost::getInvalid(); 555 auto *FVTy = cast<FixedVectorType>(VecTy); 556 InstructionCost MemCost = 557 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); 558 unsigned VF = FVTy->getNumElements() / Factor; 559 560 // The interleaved memory access pass will lower interleaved memory ops (i.e 561 // a load and store followed by a specific shuffle) to vlseg/vsseg 562 // intrinsics. In those cases then we can treat it as if it's just one (legal) 563 // memory op 564 if (!UseMaskForCond && !UseMaskForGaps && 565 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 566 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy); 567 // Need to make sure type has't been scalarized 568 if (LT.second.isFixedLengthVector()) { 569 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(), 570 LT.second.getVectorNumElements()); 571 // FIXME: We use the memory op cost of the *legalized* type here, becuase 572 // it's getMemoryOpCost returns a really expensive cost for types like 573 // <6 x i8>, which show up when doing interleaves of Factor=3 etc. 574 // Should the memory op cost of these be cheaper? 
575 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment, 576 AddressSpace, DL)) { 577 InstructionCost LegalMemCost = getMemoryOpCost( 578 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind); 579 return LT.first + LegalMemCost; 580 } 581 } 582 } 583 584 // An interleaved load will look like this for Factor=3: 585 // %wide.vec = load <12 x i32>, ptr %3, align 4 586 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask> 587 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> 588 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> 589 if (Opcode == Instruction::Load) { 590 InstructionCost Cost = MemCost; 591 for (unsigned Index : Indices) { 592 FixedVectorType *SubVecTy = 593 FixedVectorType::get(FVTy->getElementType(), VF * Factor); 594 auto Mask = createStrideMask(Index, Factor, VF); 595 InstructionCost ShuffleCost = 596 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask, 597 CostKind, 0, nullptr, {}); 598 Cost += ShuffleCost; 599 } 600 return Cost; 601 } 602 603 // TODO: Model for NF > 2 604 // We'll need to enhance getShuffleCost to model shuffles that are just 605 // inserts and extracts into subvectors, since they won't have the full cost 606 // of a vrgather. 607 // An interleaved store for 3 vectors of 4 lanes will look like 608 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7> 609 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3> 610 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11> 611 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask> 612 // store <12 x i32> %interleaved.vec, ptr %10, align 4 613 if (Factor != 2) 614 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 615 Alignment, AddressSpace, CostKind, 616 UseMaskForCond, UseMaskForGaps); 617 618 assert(Opcode == Instruction::Store && "Opcode must be a store"); 619 // For an interleaving store of 2 vectors, we perform one large interleaving 620 // shuffle that goes into the wide store 621 auto Mask = createInterleaveMask(VF, Factor); 622 InstructionCost ShuffleCost = 623 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask, 624 CostKind, 0, nullptr, {}); 625 return MemCost + ShuffleCost; 626 } 627 628 InstructionCost RISCVTTIImpl::getGatherScatterOpCost( 629 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 630 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 631 if (CostKind != TTI::TCK_RecipThroughput) 632 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 633 Alignment, CostKind, I); 634 635 if ((Opcode == Instruction::Load && 636 !isLegalMaskedGather(DataTy, Align(Alignment))) || 637 (Opcode == Instruction::Store && 638 !isLegalMaskedScatter(DataTy, Align(Alignment)))) 639 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 640 Alignment, CostKind, I); 641 642 // Cost is proportional to the number of memory operations implied. For 643 // scalable vectors, we use an estimate on that number since we don't 644 // know exactly what VL will be. 645 auto &VTy = *cast<VectorType>(DataTy); 646 InstructionCost MemOpCost = 647 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind, 648 {TTI::OK_AnyValue, TTI::OP_None}, I); 649 unsigned NumLoads = getEstimatedVLFor(&VTy); 650 return NumLoads * MemOpCost; 651 } 652 653 // Currently, these represent both throughput and codesize costs 654 // for the respective intrinsics. 
The costs in this table are simply 655 // instruction counts with the following adjustments made: 656 // * One vsetvli is considered free. 657 static const CostTblEntry VectorIntrinsicCostTable[]{ 658 {Intrinsic::floor, MVT::v2f32, 9}, 659 {Intrinsic::floor, MVT::v4f32, 9}, 660 {Intrinsic::floor, MVT::v8f32, 9}, 661 {Intrinsic::floor, MVT::v16f32, 9}, 662 {Intrinsic::floor, MVT::nxv1f32, 9}, 663 {Intrinsic::floor, MVT::nxv2f32, 9}, 664 {Intrinsic::floor, MVT::nxv4f32, 9}, 665 {Intrinsic::floor, MVT::nxv8f32, 9}, 666 {Intrinsic::floor, MVT::nxv16f32, 9}, 667 {Intrinsic::floor, MVT::v2f64, 9}, 668 {Intrinsic::floor, MVT::v4f64, 9}, 669 {Intrinsic::floor, MVT::v8f64, 9}, 670 {Intrinsic::floor, MVT::v16f64, 9}, 671 {Intrinsic::floor, MVT::nxv1f64, 9}, 672 {Intrinsic::floor, MVT::nxv2f64, 9}, 673 {Intrinsic::floor, MVT::nxv4f64, 9}, 674 {Intrinsic::floor, MVT::nxv8f64, 9}, 675 {Intrinsic::ceil, MVT::v2f32, 9}, 676 {Intrinsic::ceil, MVT::v4f32, 9}, 677 {Intrinsic::ceil, MVT::v8f32, 9}, 678 {Intrinsic::ceil, MVT::v16f32, 9}, 679 {Intrinsic::ceil, MVT::nxv1f32, 9}, 680 {Intrinsic::ceil, MVT::nxv2f32, 9}, 681 {Intrinsic::ceil, MVT::nxv4f32, 9}, 682 {Intrinsic::ceil, MVT::nxv8f32, 9}, 683 {Intrinsic::ceil, MVT::nxv16f32, 9}, 684 {Intrinsic::ceil, MVT::v2f64, 9}, 685 {Intrinsic::ceil, MVT::v4f64, 9}, 686 {Intrinsic::ceil, MVT::v8f64, 9}, 687 {Intrinsic::ceil, MVT::v16f64, 9}, 688 {Intrinsic::ceil, MVT::nxv1f64, 9}, 689 {Intrinsic::ceil, MVT::nxv2f64, 9}, 690 {Intrinsic::ceil, MVT::nxv4f64, 9}, 691 {Intrinsic::ceil, MVT::nxv8f64, 9}, 692 {Intrinsic::trunc, MVT::v2f32, 7}, 693 {Intrinsic::trunc, MVT::v4f32, 7}, 694 {Intrinsic::trunc, MVT::v8f32, 7}, 695 {Intrinsic::trunc, MVT::v16f32, 7}, 696 {Intrinsic::trunc, MVT::nxv1f32, 7}, 697 {Intrinsic::trunc, MVT::nxv2f32, 7}, 698 {Intrinsic::trunc, MVT::nxv4f32, 7}, 699 {Intrinsic::trunc, MVT::nxv8f32, 7}, 700 {Intrinsic::trunc, MVT::nxv16f32, 7}, 701 {Intrinsic::trunc, MVT::v2f64, 7}, 702 {Intrinsic::trunc, MVT::v4f64, 7}, 703 {Intrinsic::trunc, MVT::v8f64, 7}, 704 {Intrinsic::trunc, MVT::v16f64, 7}, 705 {Intrinsic::trunc, MVT::nxv1f64, 7}, 706 {Intrinsic::trunc, MVT::nxv2f64, 7}, 707 {Intrinsic::trunc, MVT::nxv4f64, 7}, 708 {Intrinsic::trunc, MVT::nxv8f64, 7}, 709 {Intrinsic::round, MVT::v2f32, 9}, 710 {Intrinsic::round, MVT::v4f32, 9}, 711 {Intrinsic::round, MVT::v8f32, 9}, 712 {Intrinsic::round, MVT::v16f32, 9}, 713 {Intrinsic::round, MVT::nxv1f32, 9}, 714 {Intrinsic::round, MVT::nxv2f32, 9}, 715 {Intrinsic::round, MVT::nxv4f32, 9}, 716 {Intrinsic::round, MVT::nxv8f32, 9}, 717 {Intrinsic::round, MVT::nxv16f32, 9}, 718 {Intrinsic::round, MVT::v2f64, 9}, 719 {Intrinsic::round, MVT::v4f64, 9}, 720 {Intrinsic::round, MVT::v8f64, 9}, 721 {Intrinsic::round, MVT::v16f64, 9}, 722 {Intrinsic::round, MVT::nxv1f64, 9}, 723 {Intrinsic::round, MVT::nxv2f64, 9}, 724 {Intrinsic::round, MVT::nxv4f64, 9}, 725 {Intrinsic::round, MVT::nxv8f64, 9}, 726 {Intrinsic::roundeven, MVT::v2f32, 9}, 727 {Intrinsic::roundeven, MVT::v4f32, 9}, 728 {Intrinsic::roundeven, MVT::v8f32, 9}, 729 {Intrinsic::roundeven, MVT::v16f32, 9}, 730 {Intrinsic::roundeven, MVT::nxv1f32, 9}, 731 {Intrinsic::roundeven, MVT::nxv2f32, 9}, 732 {Intrinsic::roundeven, MVT::nxv4f32, 9}, 733 {Intrinsic::roundeven, MVT::nxv8f32, 9}, 734 {Intrinsic::roundeven, MVT::nxv16f32, 9}, 735 {Intrinsic::roundeven, MVT::v2f64, 9}, 736 {Intrinsic::roundeven, MVT::v4f64, 9}, 737 {Intrinsic::roundeven, MVT::v8f64, 9}, 738 {Intrinsic::roundeven, MVT::v16f64, 9}, 739 {Intrinsic::roundeven, MVT::nxv1f64, 9}, 740 
{Intrinsic::roundeven, MVT::nxv2f64, 9}, 741 {Intrinsic::roundeven, MVT::nxv4f64, 9}, 742 {Intrinsic::roundeven, MVT::nxv8f64, 9}, 743 {Intrinsic::rint, MVT::v2f32, 7}, 744 {Intrinsic::rint, MVT::v4f32, 7}, 745 {Intrinsic::rint, MVT::v8f32, 7}, 746 {Intrinsic::rint, MVT::v16f32, 7}, 747 {Intrinsic::rint, MVT::nxv1f32, 7}, 748 {Intrinsic::rint, MVT::nxv2f32, 7}, 749 {Intrinsic::rint, MVT::nxv4f32, 7}, 750 {Intrinsic::rint, MVT::nxv8f32, 7}, 751 {Intrinsic::rint, MVT::nxv16f32, 7}, 752 {Intrinsic::rint, MVT::v2f64, 7}, 753 {Intrinsic::rint, MVT::v4f64, 7}, 754 {Intrinsic::rint, MVT::v8f64, 7}, 755 {Intrinsic::rint, MVT::v16f64, 7}, 756 {Intrinsic::rint, MVT::nxv1f64, 7}, 757 {Intrinsic::rint, MVT::nxv2f64, 7}, 758 {Intrinsic::rint, MVT::nxv4f64, 7}, 759 {Intrinsic::rint, MVT::nxv8f64, 7}, 760 {Intrinsic::lrint, MVT::v2i32, 1}, 761 {Intrinsic::lrint, MVT::v4i32, 1}, 762 {Intrinsic::lrint, MVT::v8i32, 1}, 763 {Intrinsic::lrint, MVT::v16i32, 1}, 764 {Intrinsic::lrint, MVT::nxv1i32, 1}, 765 {Intrinsic::lrint, MVT::nxv2i32, 1}, 766 {Intrinsic::lrint, MVT::nxv4i32, 1}, 767 {Intrinsic::lrint, MVT::nxv8i32, 1}, 768 {Intrinsic::lrint, MVT::nxv16i32, 1}, 769 {Intrinsic::lrint, MVT::v2i64, 1}, 770 {Intrinsic::lrint, MVT::v4i64, 1}, 771 {Intrinsic::lrint, MVT::v8i64, 1}, 772 {Intrinsic::lrint, MVT::v16i64, 1}, 773 {Intrinsic::lrint, MVT::nxv1i64, 1}, 774 {Intrinsic::lrint, MVT::nxv2i64, 1}, 775 {Intrinsic::lrint, MVT::nxv4i64, 1}, 776 {Intrinsic::lrint, MVT::nxv8i64, 1}, 777 {Intrinsic::llrint, MVT::v2i64, 1}, 778 {Intrinsic::llrint, MVT::v4i64, 1}, 779 {Intrinsic::llrint, MVT::v8i64, 1}, 780 {Intrinsic::llrint, MVT::v16i64, 1}, 781 {Intrinsic::llrint, MVT::nxv1i64, 1}, 782 {Intrinsic::llrint, MVT::nxv2i64, 1}, 783 {Intrinsic::llrint, MVT::nxv4i64, 1}, 784 {Intrinsic::llrint, MVT::nxv8i64, 1}, 785 {Intrinsic::nearbyint, MVT::v2f32, 9}, 786 {Intrinsic::nearbyint, MVT::v4f32, 9}, 787 {Intrinsic::nearbyint, MVT::v8f32, 9}, 788 {Intrinsic::nearbyint, MVT::v16f32, 9}, 789 {Intrinsic::nearbyint, MVT::nxv1f32, 9}, 790 {Intrinsic::nearbyint, MVT::nxv2f32, 9}, 791 {Intrinsic::nearbyint, MVT::nxv4f32, 9}, 792 {Intrinsic::nearbyint, MVT::nxv8f32, 9}, 793 {Intrinsic::nearbyint, MVT::nxv16f32, 9}, 794 {Intrinsic::nearbyint, MVT::v2f64, 9}, 795 {Intrinsic::nearbyint, MVT::v4f64, 9}, 796 {Intrinsic::nearbyint, MVT::v8f64, 9}, 797 {Intrinsic::nearbyint, MVT::v16f64, 9}, 798 {Intrinsic::nearbyint, MVT::nxv1f64, 9}, 799 {Intrinsic::nearbyint, MVT::nxv2f64, 9}, 800 {Intrinsic::nearbyint, MVT::nxv4f64, 9}, 801 {Intrinsic::nearbyint, MVT::nxv8f64, 9}, 802 {Intrinsic::bswap, MVT::v2i16, 3}, 803 {Intrinsic::bswap, MVT::v4i16, 3}, 804 {Intrinsic::bswap, MVT::v8i16, 3}, 805 {Intrinsic::bswap, MVT::v16i16, 3}, 806 {Intrinsic::bswap, MVT::nxv1i16, 3}, 807 {Intrinsic::bswap, MVT::nxv2i16, 3}, 808 {Intrinsic::bswap, MVT::nxv4i16, 3}, 809 {Intrinsic::bswap, MVT::nxv8i16, 3}, 810 {Intrinsic::bswap, MVT::nxv16i16, 3}, 811 {Intrinsic::bswap, MVT::v2i32, 12}, 812 {Intrinsic::bswap, MVT::v4i32, 12}, 813 {Intrinsic::bswap, MVT::v8i32, 12}, 814 {Intrinsic::bswap, MVT::v16i32, 12}, 815 {Intrinsic::bswap, MVT::nxv1i32, 12}, 816 {Intrinsic::bswap, MVT::nxv2i32, 12}, 817 {Intrinsic::bswap, MVT::nxv4i32, 12}, 818 {Intrinsic::bswap, MVT::nxv8i32, 12}, 819 {Intrinsic::bswap, MVT::nxv16i32, 12}, 820 {Intrinsic::bswap, MVT::v2i64, 31}, 821 {Intrinsic::bswap, MVT::v4i64, 31}, 822 {Intrinsic::bswap, MVT::v8i64, 31}, 823 {Intrinsic::bswap, MVT::v16i64, 31}, 824 {Intrinsic::bswap, MVT::nxv1i64, 31}, 825 {Intrinsic::bswap, MVT::nxv2i64, 31}, 826 
{Intrinsic::bswap, MVT::nxv4i64, 31}, 827 {Intrinsic::bswap, MVT::nxv8i64, 31}, 828 {Intrinsic::vp_bswap, MVT::v2i16, 3}, 829 {Intrinsic::vp_bswap, MVT::v4i16, 3}, 830 {Intrinsic::vp_bswap, MVT::v8i16, 3}, 831 {Intrinsic::vp_bswap, MVT::v16i16, 3}, 832 {Intrinsic::vp_bswap, MVT::nxv1i16, 3}, 833 {Intrinsic::vp_bswap, MVT::nxv2i16, 3}, 834 {Intrinsic::vp_bswap, MVT::nxv4i16, 3}, 835 {Intrinsic::vp_bswap, MVT::nxv8i16, 3}, 836 {Intrinsic::vp_bswap, MVT::nxv16i16, 3}, 837 {Intrinsic::vp_bswap, MVT::v2i32, 12}, 838 {Intrinsic::vp_bswap, MVT::v4i32, 12}, 839 {Intrinsic::vp_bswap, MVT::v8i32, 12}, 840 {Intrinsic::vp_bswap, MVT::v16i32, 12}, 841 {Intrinsic::vp_bswap, MVT::nxv1i32, 12}, 842 {Intrinsic::vp_bswap, MVT::nxv2i32, 12}, 843 {Intrinsic::vp_bswap, MVT::nxv4i32, 12}, 844 {Intrinsic::vp_bswap, MVT::nxv8i32, 12}, 845 {Intrinsic::vp_bswap, MVT::nxv16i32, 12}, 846 {Intrinsic::vp_bswap, MVT::v2i64, 31}, 847 {Intrinsic::vp_bswap, MVT::v4i64, 31}, 848 {Intrinsic::vp_bswap, MVT::v8i64, 31}, 849 {Intrinsic::vp_bswap, MVT::v16i64, 31}, 850 {Intrinsic::vp_bswap, MVT::nxv1i64, 31}, 851 {Intrinsic::vp_bswap, MVT::nxv2i64, 31}, 852 {Intrinsic::vp_bswap, MVT::nxv4i64, 31}, 853 {Intrinsic::vp_bswap, MVT::nxv8i64, 31}, 854 {Intrinsic::vp_fshl, MVT::v2i8, 7}, 855 {Intrinsic::vp_fshl, MVT::v4i8, 7}, 856 {Intrinsic::vp_fshl, MVT::v8i8, 7}, 857 {Intrinsic::vp_fshl, MVT::v16i8, 7}, 858 {Intrinsic::vp_fshl, MVT::nxv1i8, 7}, 859 {Intrinsic::vp_fshl, MVT::nxv2i8, 7}, 860 {Intrinsic::vp_fshl, MVT::nxv4i8, 7}, 861 {Intrinsic::vp_fshl, MVT::nxv8i8, 7}, 862 {Intrinsic::vp_fshl, MVT::nxv16i8, 7}, 863 {Intrinsic::vp_fshl, MVT::nxv32i8, 7}, 864 {Intrinsic::vp_fshl, MVT::nxv64i8, 7}, 865 {Intrinsic::vp_fshl, MVT::v2i16, 7}, 866 {Intrinsic::vp_fshl, MVT::v4i16, 7}, 867 {Intrinsic::vp_fshl, MVT::v8i16, 7}, 868 {Intrinsic::vp_fshl, MVT::v16i16, 7}, 869 {Intrinsic::vp_fshl, MVT::nxv1i16, 7}, 870 {Intrinsic::vp_fshl, MVT::nxv2i16, 7}, 871 {Intrinsic::vp_fshl, MVT::nxv4i16, 7}, 872 {Intrinsic::vp_fshl, MVT::nxv8i16, 7}, 873 {Intrinsic::vp_fshl, MVT::nxv16i16, 7}, 874 {Intrinsic::vp_fshl, MVT::nxv32i16, 7}, 875 {Intrinsic::vp_fshl, MVT::v2i32, 7}, 876 {Intrinsic::vp_fshl, MVT::v4i32, 7}, 877 {Intrinsic::vp_fshl, MVT::v8i32, 7}, 878 {Intrinsic::vp_fshl, MVT::v16i32, 7}, 879 {Intrinsic::vp_fshl, MVT::nxv1i32, 7}, 880 {Intrinsic::vp_fshl, MVT::nxv2i32, 7}, 881 {Intrinsic::vp_fshl, MVT::nxv4i32, 7}, 882 {Intrinsic::vp_fshl, MVT::nxv8i32, 7}, 883 {Intrinsic::vp_fshl, MVT::nxv16i32, 7}, 884 {Intrinsic::vp_fshl, MVT::v2i64, 7}, 885 {Intrinsic::vp_fshl, MVT::v4i64, 7}, 886 {Intrinsic::vp_fshl, MVT::v8i64, 7}, 887 {Intrinsic::vp_fshl, MVT::v16i64, 7}, 888 {Intrinsic::vp_fshl, MVT::nxv1i64, 7}, 889 {Intrinsic::vp_fshl, MVT::nxv2i64, 7}, 890 {Intrinsic::vp_fshl, MVT::nxv4i64, 7}, 891 {Intrinsic::vp_fshl, MVT::nxv8i64, 7}, 892 {Intrinsic::vp_fshr, MVT::v2i8, 7}, 893 {Intrinsic::vp_fshr, MVT::v4i8, 7}, 894 {Intrinsic::vp_fshr, MVT::v8i8, 7}, 895 {Intrinsic::vp_fshr, MVT::v16i8, 7}, 896 {Intrinsic::vp_fshr, MVT::nxv1i8, 7}, 897 {Intrinsic::vp_fshr, MVT::nxv2i8, 7}, 898 {Intrinsic::vp_fshr, MVT::nxv4i8, 7}, 899 {Intrinsic::vp_fshr, MVT::nxv8i8, 7}, 900 {Intrinsic::vp_fshr, MVT::nxv16i8, 7}, 901 {Intrinsic::vp_fshr, MVT::nxv32i8, 7}, 902 {Intrinsic::vp_fshr, MVT::nxv64i8, 7}, 903 {Intrinsic::vp_fshr, MVT::v2i16, 7}, 904 {Intrinsic::vp_fshr, MVT::v4i16, 7}, 905 {Intrinsic::vp_fshr, MVT::v8i16, 7}, 906 {Intrinsic::vp_fshr, MVT::v16i16, 7}, 907 {Intrinsic::vp_fshr, MVT::nxv1i16, 7}, 908 {Intrinsic::vp_fshr, MVT::nxv2i16, 7}, 909 
{Intrinsic::vp_fshr, MVT::nxv4i16, 7}, 910 {Intrinsic::vp_fshr, MVT::nxv8i16, 7}, 911 {Intrinsic::vp_fshr, MVT::nxv16i16, 7}, 912 {Intrinsic::vp_fshr, MVT::nxv32i16, 7}, 913 {Intrinsic::vp_fshr, MVT::v2i32, 7}, 914 {Intrinsic::vp_fshr, MVT::v4i32, 7}, 915 {Intrinsic::vp_fshr, MVT::v8i32, 7}, 916 {Intrinsic::vp_fshr, MVT::v16i32, 7}, 917 {Intrinsic::vp_fshr, MVT::nxv1i32, 7}, 918 {Intrinsic::vp_fshr, MVT::nxv2i32, 7}, 919 {Intrinsic::vp_fshr, MVT::nxv4i32, 7}, 920 {Intrinsic::vp_fshr, MVT::nxv8i32, 7}, 921 {Intrinsic::vp_fshr, MVT::nxv16i32, 7}, 922 {Intrinsic::vp_fshr, MVT::v2i64, 7}, 923 {Intrinsic::vp_fshr, MVT::v4i64, 7}, 924 {Intrinsic::vp_fshr, MVT::v8i64, 7}, 925 {Intrinsic::vp_fshr, MVT::v16i64, 7}, 926 {Intrinsic::vp_fshr, MVT::nxv1i64, 7}, 927 {Intrinsic::vp_fshr, MVT::nxv2i64, 7}, 928 {Intrinsic::vp_fshr, MVT::nxv4i64, 7}, 929 {Intrinsic::vp_fshr, MVT::nxv8i64, 7}, 930 {Intrinsic::bitreverse, MVT::v2i8, 17}, 931 {Intrinsic::bitreverse, MVT::v4i8, 17}, 932 {Intrinsic::bitreverse, MVT::v8i8, 17}, 933 {Intrinsic::bitreverse, MVT::v16i8, 17}, 934 {Intrinsic::bitreverse, MVT::nxv1i8, 17}, 935 {Intrinsic::bitreverse, MVT::nxv2i8, 17}, 936 {Intrinsic::bitreverse, MVT::nxv4i8, 17}, 937 {Intrinsic::bitreverse, MVT::nxv8i8, 17}, 938 {Intrinsic::bitreverse, MVT::nxv16i8, 17}, 939 {Intrinsic::bitreverse, MVT::v2i16, 24}, 940 {Intrinsic::bitreverse, MVT::v4i16, 24}, 941 {Intrinsic::bitreverse, MVT::v8i16, 24}, 942 {Intrinsic::bitreverse, MVT::v16i16, 24}, 943 {Intrinsic::bitreverse, MVT::nxv1i16, 24}, 944 {Intrinsic::bitreverse, MVT::nxv2i16, 24}, 945 {Intrinsic::bitreverse, MVT::nxv4i16, 24}, 946 {Intrinsic::bitreverse, MVT::nxv8i16, 24}, 947 {Intrinsic::bitreverse, MVT::nxv16i16, 24}, 948 {Intrinsic::bitreverse, MVT::v2i32, 33}, 949 {Intrinsic::bitreverse, MVT::v4i32, 33}, 950 {Intrinsic::bitreverse, MVT::v8i32, 33}, 951 {Intrinsic::bitreverse, MVT::v16i32, 33}, 952 {Intrinsic::bitreverse, MVT::nxv1i32, 33}, 953 {Intrinsic::bitreverse, MVT::nxv2i32, 33}, 954 {Intrinsic::bitreverse, MVT::nxv4i32, 33}, 955 {Intrinsic::bitreverse, MVT::nxv8i32, 33}, 956 {Intrinsic::bitreverse, MVT::nxv16i32, 33}, 957 {Intrinsic::bitreverse, MVT::v2i64, 52}, 958 {Intrinsic::bitreverse, MVT::v4i64, 52}, 959 {Intrinsic::bitreverse, MVT::v8i64, 52}, 960 {Intrinsic::bitreverse, MVT::v16i64, 52}, 961 {Intrinsic::bitreverse, MVT::nxv1i64, 52}, 962 {Intrinsic::bitreverse, MVT::nxv2i64, 52}, 963 {Intrinsic::bitreverse, MVT::nxv4i64, 52}, 964 {Intrinsic::bitreverse, MVT::nxv8i64, 52}, 965 {Intrinsic::vp_bitreverse, MVT::v2i8, 17}, 966 {Intrinsic::vp_bitreverse, MVT::v4i8, 17}, 967 {Intrinsic::vp_bitreverse, MVT::v8i8, 17}, 968 {Intrinsic::vp_bitreverse, MVT::v16i8, 17}, 969 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17}, 970 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17}, 971 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17}, 972 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17}, 973 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17}, 974 {Intrinsic::vp_bitreverse, MVT::v2i16, 24}, 975 {Intrinsic::vp_bitreverse, MVT::v4i16, 24}, 976 {Intrinsic::vp_bitreverse, MVT::v8i16, 24}, 977 {Intrinsic::vp_bitreverse, MVT::v16i16, 24}, 978 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24}, 979 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24}, 980 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24}, 981 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24}, 982 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24}, 983 {Intrinsic::vp_bitreverse, MVT::v2i32, 33}, 984 {Intrinsic::vp_bitreverse, MVT::v4i32, 33}, 985 {Intrinsic::vp_bitreverse, MVT::v8i32, 33}, 986 {Intrinsic::vp_bitreverse, 
MVT::v16i32, 33}, 987 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33}, 988 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33}, 989 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33}, 990 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33}, 991 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33}, 992 {Intrinsic::vp_bitreverse, MVT::v2i64, 52}, 993 {Intrinsic::vp_bitreverse, MVT::v4i64, 52}, 994 {Intrinsic::vp_bitreverse, MVT::v8i64, 52}, 995 {Intrinsic::vp_bitreverse, MVT::v16i64, 52}, 996 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52}, 997 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52}, 998 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52}, 999 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52}, 1000 {Intrinsic::ctpop, MVT::v2i8, 12}, 1001 {Intrinsic::ctpop, MVT::v4i8, 12}, 1002 {Intrinsic::ctpop, MVT::v8i8, 12}, 1003 {Intrinsic::ctpop, MVT::v16i8, 12}, 1004 {Intrinsic::ctpop, MVT::nxv1i8, 12}, 1005 {Intrinsic::ctpop, MVT::nxv2i8, 12}, 1006 {Intrinsic::ctpop, MVT::nxv4i8, 12}, 1007 {Intrinsic::ctpop, MVT::nxv8i8, 12}, 1008 {Intrinsic::ctpop, MVT::nxv16i8, 12}, 1009 {Intrinsic::ctpop, MVT::v2i16, 19}, 1010 {Intrinsic::ctpop, MVT::v4i16, 19}, 1011 {Intrinsic::ctpop, MVT::v8i16, 19}, 1012 {Intrinsic::ctpop, MVT::v16i16, 19}, 1013 {Intrinsic::ctpop, MVT::nxv1i16, 19}, 1014 {Intrinsic::ctpop, MVT::nxv2i16, 19}, 1015 {Intrinsic::ctpop, MVT::nxv4i16, 19}, 1016 {Intrinsic::ctpop, MVT::nxv8i16, 19}, 1017 {Intrinsic::ctpop, MVT::nxv16i16, 19}, 1018 {Intrinsic::ctpop, MVT::v2i32, 20}, 1019 {Intrinsic::ctpop, MVT::v4i32, 20}, 1020 {Intrinsic::ctpop, MVT::v8i32, 20}, 1021 {Intrinsic::ctpop, MVT::v16i32, 20}, 1022 {Intrinsic::ctpop, MVT::nxv1i32, 20}, 1023 {Intrinsic::ctpop, MVT::nxv2i32, 20}, 1024 {Intrinsic::ctpop, MVT::nxv4i32, 20}, 1025 {Intrinsic::ctpop, MVT::nxv8i32, 20}, 1026 {Intrinsic::ctpop, MVT::nxv16i32, 20}, 1027 {Intrinsic::ctpop, MVT::v2i64, 21}, 1028 {Intrinsic::ctpop, MVT::v4i64, 21}, 1029 {Intrinsic::ctpop, MVT::v8i64, 21}, 1030 {Intrinsic::ctpop, MVT::v16i64, 21}, 1031 {Intrinsic::ctpop, MVT::nxv1i64, 21}, 1032 {Intrinsic::ctpop, MVT::nxv2i64, 21}, 1033 {Intrinsic::ctpop, MVT::nxv4i64, 21}, 1034 {Intrinsic::ctpop, MVT::nxv8i64, 21}, 1035 {Intrinsic::vp_ctpop, MVT::v2i8, 12}, 1036 {Intrinsic::vp_ctpop, MVT::v4i8, 12}, 1037 {Intrinsic::vp_ctpop, MVT::v8i8, 12}, 1038 {Intrinsic::vp_ctpop, MVT::v16i8, 12}, 1039 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12}, 1040 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12}, 1041 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12}, 1042 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12}, 1043 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12}, 1044 {Intrinsic::vp_ctpop, MVT::v2i16, 19}, 1045 {Intrinsic::vp_ctpop, MVT::v4i16, 19}, 1046 {Intrinsic::vp_ctpop, MVT::v8i16, 19}, 1047 {Intrinsic::vp_ctpop, MVT::v16i16, 19}, 1048 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19}, 1049 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19}, 1050 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19}, 1051 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19}, 1052 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19}, 1053 {Intrinsic::vp_ctpop, MVT::v2i32, 20}, 1054 {Intrinsic::vp_ctpop, MVT::v4i32, 20}, 1055 {Intrinsic::vp_ctpop, MVT::v8i32, 20}, 1056 {Intrinsic::vp_ctpop, MVT::v16i32, 20}, 1057 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20}, 1058 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20}, 1059 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20}, 1060 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20}, 1061 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20}, 1062 {Intrinsic::vp_ctpop, MVT::v2i64, 21}, 1063 {Intrinsic::vp_ctpop, MVT::v4i64, 21}, 1064 {Intrinsic::vp_ctpop, MVT::v8i64, 21}, 1065 {Intrinsic::vp_ctpop, MVT::v16i64, 21}, 1066 {Intrinsic::vp_ctpop, 
MVT::nxv1i64, 21}, 1067 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21}, 1068 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21}, 1069 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21}, 1070 {Intrinsic::vp_ctlz, MVT::v2i8, 19}, 1071 {Intrinsic::vp_ctlz, MVT::v4i8, 19}, 1072 {Intrinsic::vp_ctlz, MVT::v8i8, 19}, 1073 {Intrinsic::vp_ctlz, MVT::v16i8, 19}, 1074 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19}, 1075 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19}, 1076 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19}, 1077 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19}, 1078 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19}, 1079 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19}, 1080 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19}, 1081 {Intrinsic::vp_ctlz, MVT::v2i16, 28}, 1082 {Intrinsic::vp_ctlz, MVT::v4i16, 28}, 1083 {Intrinsic::vp_ctlz, MVT::v8i16, 28}, 1084 {Intrinsic::vp_ctlz, MVT::v16i16, 28}, 1085 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28}, 1086 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28}, 1087 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28}, 1088 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28}, 1089 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28}, 1090 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28}, 1091 {Intrinsic::vp_ctlz, MVT::v2i32, 31}, 1092 {Intrinsic::vp_ctlz, MVT::v4i32, 31}, 1093 {Intrinsic::vp_ctlz, MVT::v8i32, 31}, 1094 {Intrinsic::vp_ctlz, MVT::v16i32, 31}, 1095 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31}, 1096 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31}, 1097 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31}, 1098 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31}, 1099 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31}, 1100 {Intrinsic::vp_ctlz, MVT::v2i64, 35}, 1101 {Intrinsic::vp_ctlz, MVT::v4i64, 35}, 1102 {Intrinsic::vp_ctlz, MVT::v8i64, 35}, 1103 {Intrinsic::vp_ctlz, MVT::v16i64, 35}, 1104 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35}, 1105 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35}, 1106 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35}, 1107 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35}, 1108 {Intrinsic::vp_cttz, MVT::v2i8, 16}, 1109 {Intrinsic::vp_cttz, MVT::v4i8, 16}, 1110 {Intrinsic::vp_cttz, MVT::v8i8, 16}, 1111 {Intrinsic::vp_cttz, MVT::v16i8, 16}, 1112 {Intrinsic::vp_cttz, MVT::nxv1i8, 16}, 1113 {Intrinsic::vp_cttz, MVT::nxv2i8, 16}, 1114 {Intrinsic::vp_cttz, MVT::nxv4i8, 16}, 1115 {Intrinsic::vp_cttz, MVT::nxv8i8, 16}, 1116 {Intrinsic::vp_cttz, MVT::nxv16i8, 16}, 1117 {Intrinsic::vp_cttz, MVT::nxv32i8, 16}, 1118 {Intrinsic::vp_cttz, MVT::nxv64i8, 16}, 1119 {Intrinsic::vp_cttz, MVT::v2i16, 23}, 1120 {Intrinsic::vp_cttz, MVT::v4i16, 23}, 1121 {Intrinsic::vp_cttz, MVT::v8i16, 23}, 1122 {Intrinsic::vp_cttz, MVT::v16i16, 23}, 1123 {Intrinsic::vp_cttz, MVT::nxv1i16, 23}, 1124 {Intrinsic::vp_cttz, MVT::nxv2i16, 23}, 1125 {Intrinsic::vp_cttz, MVT::nxv4i16, 23}, 1126 {Intrinsic::vp_cttz, MVT::nxv8i16, 23}, 1127 {Intrinsic::vp_cttz, MVT::nxv16i16, 23}, 1128 {Intrinsic::vp_cttz, MVT::nxv32i16, 23}, 1129 {Intrinsic::vp_cttz, MVT::v2i32, 24}, 1130 {Intrinsic::vp_cttz, MVT::v4i32, 24}, 1131 {Intrinsic::vp_cttz, MVT::v8i32, 24}, 1132 {Intrinsic::vp_cttz, MVT::v16i32, 24}, 1133 {Intrinsic::vp_cttz, MVT::nxv1i32, 24}, 1134 {Intrinsic::vp_cttz, MVT::nxv2i32, 24}, 1135 {Intrinsic::vp_cttz, MVT::nxv4i32, 24}, 1136 {Intrinsic::vp_cttz, MVT::nxv8i32, 24}, 1137 {Intrinsic::vp_cttz, MVT::nxv16i32, 24}, 1138 {Intrinsic::vp_cttz, MVT::v2i64, 25}, 1139 {Intrinsic::vp_cttz, MVT::v4i64, 25}, 1140 {Intrinsic::vp_cttz, MVT::v8i64, 25}, 1141 {Intrinsic::vp_cttz, MVT::v16i64, 25}, 1142 {Intrinsic::vp_cttz, MVT::nxv1i64, 25}, 1143 {Intrinsic::vp_cttz, MVT::nxv2i64, 25}, 1144 {Intrinsic::vp_cttz, MVT::nxv4i64, 25}, 1145 {Intrinsic::vp_cttz, MVT::nxv8i64, 25}, 1146 }; 1147 1148 static unsigned 
getISDForVPIntrinsicID(Intrinsic::ID ID) { 1149 switch (ID) { 1150 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \ 1151 case Intrinsic::VPID: \ 1152 return ISD::VPSD; 1153 #include "llvm/IR/VPIntrinsics.def" 1154 #undef HELPER_MAP_VPID_TO_VPSD 1155 } 1156 return ISD::DELETED_NODE; 1157 } 1158 1159 InstructionCost 1160 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1161 TTI::TargetCostKind CostKind) { 1162 auto *RetTy = ICA.getReturnType(); 1163 switch (ICA.getID()) { 1164 case Intrinsic::ceil: 1165 case Intrinsic::floor: 1166 case Intrinsic::trunc: 1167 case Intrinsic::rint: 1168 case Intrinsic::lrint: 1169 case Intrinsic::llrint: 1170 case Intrinsic::round: 1171 case Intrinsic::roundeven: { 1172 // These all use the same code. 1173 auto LT = getTypeLegalizationCost(RetTy); 1174 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second)) 1175 return LT.first * 8; 1176 break; 1177 } 1178 case Intrinsic::umin: 1179 case Intrinsic::umax: 1180 case Intrinsic::smin: 1181 case Intrinsic::smax: { 1182 auto LT = getTypeLegalizationCost(RetTy); 1183 if ((ST->hasVInstructions() && LT.second.isVector()) || 1184 (LT.second.isScalarInteger() && ST->hasStdExtZbb())) 1185 return LT.first; 1186 break; 1187 } 1188 case Intrinsic::sadd_sat: 1189 case Intrinsic::ssub_sat: 1190 case Intrinsic::uadd_sat: 1191 case Intrinsic::usub_sat: 1192 case Intrinsic::fabs: 1193 case Intrinsic::sqrt: { 1194 auto LT = getTypeLegalizationCost(RetTy); 1195 if (ST->hasVInstructions() && LT.second.isVector()) 1196 return LT.first; 1197 break; 1198 } 1199 case Intrinsic::ctpop: { 1200 auto LT = getTypeLegalizationCost(RetTy); 1201 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) 1202 return LT.first; 1203 break; 1204 } 1205 case Intrinsic::abs: { 1206 auto LT = getTypeLegalizationCost(RetTy); 1207 if (ST->hasVInstructions() && LT.second.isVector()) { 1208 // vrsub.vi v10, v8, 0 1209 // vmax.vv v8, v8, v10 1210 return LT.first * 2; 1211 } 1212 break; 1213 } 1214 // TODO: add more intrinsic 1215 case Intrinsic::experimental_stepvector: { 1216 unsigned Cost = 1; // vid 1217 auto LT = getTypeLegalizationCost(RetTy); 1218 return Cost + (LT.first - 1); 1219 } 1220 case Intrinsic::vp_rint: { 1221 // RISC-V target uses at least 5 instructions to lower rounding intrinsics. 1222 unsigned Cost = 5; 1223 auto LT = getTypeLegalizationCost(RetTy); 1224 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second)) 1225 return Cost * LT.first; 1226 break; 1227 } 1228 case Intrinsic::vp_nearbyint: { 1229 // More one read and one write for fflags than vp_rint. 1230 unsigned Cost = 7; 1231 auto LT = getTypeLegalizationCost(RetTy); 1232 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second)) 1233 return Cost * LT.first; 1234 break; 1235 } 1236 case Intrinsic::vp_ceil: 1237 case Intrinsic::vp_floor: 1238 case Intrinsic::vp_round: 1239 case Intrinsic::vp_roundeven: 1240 case Intrinsic::vp_roundtozero: { 1241 // Rounding with static rounding mode needs two more instructions to 1242 // swap/write FRM than vp_rint. 
1243 unsigned Cost = 7; 1244 auto LT = getTypeLegalizationCost(RetTy); 1245 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID()); 1246 if (TLI->isOperationCustom(VPISD, LT.second)) 1247 return Cost * LT.first; 1248 break; 1249 } 1250 } 1251 1252 if (ST->hasVInstructions() && RetTy->isVectorTy()) { 1253 auto LT = getTypeLegalizationCost(RetTy); 1254 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable, 1255 ICA.getID(), LT.second)) 1256 return LT.first * Entry->Cost; 1257 } 1258 1259 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1260 } 1261 1262 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1263 Type *Src, 1264 TTI::CastContextHint CCH, 1265 TTI::TargetCostKind CostKind, 1266 const Instruction *I) { 1267 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) { 1268 // FIXME: Need to compute legalizing cost for illegal types. 1269 if (!isTypeLegal(Src) || !isTypeLegal(Dst)) 1270 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1271 1272 // Skip if element size of Dst or Src is bigger than ELEN. 1273 if (Src->getScalarSizeInBits() > ST->getELen() || 1274 Dst->getScalarSizeInBits() > ST->getELen()) 1275 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1276 1277 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1278 assert(ISD && "Invalid opcode"); 1279 1280 // FIXME: Need to consider vsetvli and lmul. 1281 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) - 1282 (int)Log2_32(Src->getScalarSizeInBits()); 1283 switch (ISD) { 1284 case ISD::SIGN_EXTEND: 1285 case ISD::ZERO_EXTEND: 1286 if (Src->getScalarSizeInBits() == 1) { 1287 // We do not use vsext/vzext to extend from mask vector. 1288 // Instead we use the following instructions to extend from mask vector: 1289 // vmv.v.i v8, 0 1290 // vmerge.vim v8, v8, -1, v0 1291 return 2; 1292 } 1293 return 1; 1294 case ISD::TRUNCATE: 1295 if (Dst->getScalarSizeInBits() == 1) { 1296 // We do not use several vncvt to truncate to mask vector. So we could 1297 // not use PowDiff to calculate it. 1298 // Instead we use the following instructions to truncate to mask vector: 1299 // vand.vi v8, v8, 1 1300 // vmsne.vi v0, v8, 0 1301 return 2; 1302 } 1303 [[fallthrough]]; 1304 case ISD::FP_EXTEND: 1305 case ISD::FP_ROUND: 1306 // Counts of narrow/widen instructions. 1307 return std::abs(PowDiff); 1308 case ISD::FP_TO_SINT: 1309 case ISD::FP_TO_UINT: 1310 case ISD::SINT_TO_FP: 1311 case ISD::UINT_TO_FP: 1312 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) { 1313 // The cost of convert from or to mask vector is different from other 1314 // cases. We could not use PowDiff to calculate it. 1315 // For mask vector to fp, we should use the following instructions: 1316 // vmv.v.i v8, 0 1317 // vmerge.vim v8, v8, -1, v0 1318 // vfcvt.f.x.v v8, v8 1319 1320 // And for fp vector to mask, we use: 1321 // vfncvt.rtz.x.f.w v9, v8 1322 // vand.vi v8, v9, 1 1323 // vmsne.vi v0, v8, 0 1324 return 3; 1325 } 1326 if (std::abs(PowDiff) <= 1) 1327 return 1; 1328 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8), 1329 // so it only need two conversion. 1330 if (Src->isIntOrIntVectorTy()) 1331 return 2; 1332 // Counts of narrow/widen instructions. 
1333 return std::abs(PowDiff); 1334 } 1335 } 1336 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1337 } 1338 1339 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) { 1340 if (isa<ScalableVectorType>(Ty)) { 1341 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType()); 1342 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue(); 1343 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock; 1344 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize); 1345 } 1346 return cast<FixedVectorType>(Ty)->getNumElements(); 1347 } 1348 1349 InstructionCost 1350 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 1351 FastMathFlags FMF, 1352 TTI::TargetCostKind CostKind) { 1353 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors()) 1354 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 1355 1356 // Skip if scalar size of Ty is bigger than ELEN. 1357 if (Ty->getScalarSizeInBits() > ST->getELen()) 1358 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 1359 1360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 1361 if (Ty->getElementType()->isIntegerTy(1)) 1362 // vcpop sequences, see vreduction-mask.ll. umax, smin actually only 1363 // cost 2, but we don't have enough info here so we slightly over cost. 1364 return (LT.first - 1) + 3; 1365 1366 // IR Reduction is composed by two vmv and one rvv reduction instruction. 1367 InstructionCost BaseCost = 2; 1368 1369 if (CostKind == TTI::TCK_CodeSize) 1370 return (LT.first - 1) + BaseCost; 1371 1372 unsigned VL = getEstimatedVLFor(Ty); 1373 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL); 1374 } 1375 1376 InstructionCost 1377 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 1378 std::optional<FastMathFlags> FMF, 1379 TTI::TargetCostKind CostKind) { 1380 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors()) 1381 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); 1382 1383 // Skip if scalar size of Ty is bigger than ELEN. 1384 if (Ty->getScalarSizeInBits() > ST->getELen()) 1385 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); 1386 1387 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1388 assert(ISD && "Invalid opcode"); 1389 1390 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND && 1391 ISD != ISD::FADD) 1392 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); 1393 1394 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 1395 if (Ty->getElementType()->isIntegerTy(1)) 1396 // vcpop sequences, see vreduction-mask.ll 1397 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2); 1398 1399 // IR Reduction is composed by two vmv and one rvv reduction instruction. 
1400 InstructionCost BaseCost = 2; 1401 1402 if (CostKind == TTI::TCK_CodeSize) 1403 return (LT.first - 1) + BaseCost; 1404 1405 unsigned VL = getEstimatedVLFor(Ty); 1406 if (TTI::requiresOrderedReduction(FMF)) 1407 return (LT.first - 1) + BaseCost + VL; 1408 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL); 1409 } 1410 1411 InstructionCost RISCVTTIImpl::getExtendedReductionCost( 1412 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, 1413 FastMathFlags FMF, TTI::TargetCostKind CostKind) { 1414 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors()) 1415 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1416 FMF, CostKind); 1417 1418 // Skip if scalar size of ResTy is bigger than ELEN. 1419 if (ResTy->getScalarSizeInBits() > ST->getELen()) 1420 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1421 FMF, CostKind); 1422 1423 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd) 1424 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1425 FMF, CostKind); 1426 1427 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1428 1429 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits()) 1430 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1431 FMF, CostKind); 1432 1433 return (LT.first - 1) + 1434 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1435 } 1436 1437 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty, 1438 TTI::OperandValueInfo OpInfo, 1439 TTI::TargetCostKind CostKind) { 1440 assert(OpInfo.isConstant() && "non constant operand?"); 1441 if (!isa<VectorType>(Ty)) 1442 // FIXME: We need to account for immediate materialization here, but doing 1443 // a decent job requires more knowledge about the immediate than we 1444 // currently have here. 1445 return 0; 1446 1447 if (OpInfo.isUniform()) 1448 // vmv.x.i, vmv.v.x, or vfmv.v.f 1449 // We ignore the cost of the scalar constant materialization to be consistent 1450 // with how we treat scalar constants themselves just above. 1451 return 1; 1452 1453 return getConstantPoolLoadCost(Ty, CostKind); 1454 } 1455 1456 1457 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 1458 MaybeAlign Alignment, 1459 unsigned AddressSpace, 1460 TTI::TargetCostKind CostKind, 1461 TTI::OperandValueInfo OpInfo, 1462 const Instruction *I) { 1463 EVT VT = TLI->getValueType(DL, Src, true); 1464 // Type legalization can't handle structs 1465 if (VT == MVT::Other) 1466 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1467 CostKind, OpInfo, I); 1468 1469 InstructionCost Cost = 0; 1470 if (Opcode == Instruction::Store && OpInfo.isConstant()) 1471 Cost += getStoreImmCost(Src, OpInfo, CostKind); 1472 InstructionCost BaseCost = 1473 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1474 CostKind, OpInfo, I); 1475 // Assume memory ops cost scale with the number of vector registers 1476 // possible accessed by the instruction. Note that BasicTTI already 1477 // handles the LT.first term for us. 
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector())
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Supported natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating-point vector type, use the base
    // implementation, which computes:
    //   ScalarizeCost + Num * Cost for a fixed vector,
    //   InvalidCost for a scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
    // Supported natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
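  // Under TCK_RecipThroughput this models control flow (branches, PHIs) as
  // free.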
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a load/store sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vectors.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement we additionally need an addi to compute Index + 1,
  // which is used as the VL of the vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
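    // (For example, extracting element 0 is just a vmv.x.s or vfmv.f.s from
    // the source register, so no slide cost is charged below.)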
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
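  // Illustrative example: in `add <4 x i32> %x, <constant splat of 5>` the
  // splat operand is a uniform constant that the .vi/.vx forms can consume,
  // so getConstantMatCost above models it as 0; a non-uniform constant vector
  // instead pays a constant-pool load.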
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we only take GEP instructions into account (although a
  // pointer here may also be an alloca, a plain value, a constant or constant
  // expression, a PHI, a bitcast, or anything else allowed to be used as a
  // pointer). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, the rest are either GEP
  // instructions, PHIs, bitcasts or constants. When the pointers share a
  // base, we model each non-Base GEP as an ADD operation if any of its
  // indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: Further tuning on benchmarks and metrics, with changes as needed,
  // would apply to all of the settings below before enabling them for
  // performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent on the conditions
  // below.
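  // (UpperBound lets the unroller use the maximum trip count when the exact
  // trip count is unknown.)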
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early exit as
  // it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force-unrolling small loops can be very useful because of the
  // taken-branch cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in the LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or the element width is absurd, disable
  // vectorization by returning 1.
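  // Illustrative example (assumed register width): with a 128-bit fixed-width
  // register and ElemWidth == 32 this returns 4; if ElemWidth exceeds the
  // register width, the division yields 0 and the max below clamps it to 1.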
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give instruction count first
  // priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}