//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include <cmath>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKind
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
    case RISCV::VFMV_F_S:
    case RISCV::VFMV_S_F:
    case RISCV::VMOR_MM:
    case RISCV::VMXOR_MM:
    case RISCV::VMAND_MM:
    case RISCV::VMANDN_MM:
    case RISCV::VMNAND_MM:
    case RISCV::VCPOP_M:
    case RISCV::VFIRST_M:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

static InstructionCost getIntImmCostImpl(const DataLayout &DL,
                                         const RISCVSubtarget *ST,
                                         const APInt &Imm, Type *Ty,
                                         TTI::TargetCostKind CostKind,
                                         bool FreeZeroes) {
  assert(Ty->isIntegerTy() &&
"getIntImmCost can only estimate cost of materialising integers"); 119 120 // We have a Zero register, so 0 is always free. 121 if (Imm == 0) 122 return TTI::TCC_Free; 123 124 // Otherwise, we check how many instructions it will take to materialise. 125 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST, 126 /*CompressionCost=*/false, FreeZeroes); 127 } 128 129 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 130 TTI::TargetCostKind CostKind) { 131 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false); 132 } 133 134 // Look for patterns of shift followed by AND that can be turned into a pair of 135 // shifts. We won't need to materialize an immediate for the AND so these can 136 // be considered free. 137 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { 138 uint64_t Mask = Imm.getZExtValue(); 139 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0)); 140 if (!BO || !BO->hasOneUse()) 141 return false; 142 143 if (BO->getOpcode() != Instruction::Shl) 144 return false; 145 146 if (!isa<ConstantInt>(BO->getOperand(1))) 147 return false; 148 149 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue(); 150 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1 151 // is a mask shifted by c2 bits with c3 leading zeros. 152 if (isShiftedMask_64(Mask)) { 153 unsigned Trailing = llvm::countr_zero(Mask); 154 if (ShAmt == Trailing) 155 return true; 156 } 157 158 return false; 159 } 160 161 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 162 const APInt &Imm, Type *Ty, 163 TTI::TargetCostKind CostKind, 164 Instruction *Inst) { 165 assert(Ty->isIntegerTy() && 166 "getIntImmCost can only estimate cost of materialising integers"); 167 168 // We have a Zero register, so 0 is always free. 169 if (Imm == 0) 170 return TTI::TCC_Free; 171 172 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are 173 // commutative, in others the immediate comes from a specific argument index. 174 bool Takes12BitImm = false; 175 unsigned ImmArgIdx = ~0U; 176 177 switch (Opcode) { 178 case Instruction::GetElementPtr: 179 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will 180 // split up large offsets in GEP into better parts than ConstantHoisting 181 // can. 182 return TTI::TCC_Free; 183 case Instruction::Store: { 184 // Use the materialization cost regardless of if it's the address or the 185 // value that is constant, except for if the store is misaligned and 186 // misaligned accesses are not legal (experience shows constant hoisting 187 // can sometimes be harmful in such cases). 188 if (Idx == 1 || !Inst) 189 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, 190 /*FreeZeroes=*/true); 191 192 StoreInst *ST = cast<StoreInst>(Inst); 193 if (!getTLI()->allowsMemoryAccessForAlignment( 194 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty), 195 ST->getPointerAddressSpace(), ST->getAlign())) 196 return TTI::TCC_Free; 197 198 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, 199 /*FreeZeroes=*/true); 200 } 201 case Instruction::Load: 202 // If the address is a constant, use the materialization cost. 
    return getIntImmCost(Imm, Ty, CostKind);
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
  return ST->hasVInstructions();
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args,
                                             const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
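  // Note that each case below re-checks that the *legalized* type is still a
  // fixed-length vector before using the fixed-length specific costing;
  // otherwise we fall through to the generic handling further down.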
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves
        // and deinterleaves of 2 vectors can be lowered into the following
        // sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. We provide an accurate cost only for splits
      // where the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Extract at zero is always a subregister extract
    if (Index == 0)
      return TTI::TCC_Free;

    // If we're extracting a subvector of at most m1 size at a sub-register
    // boundary - which unfortunately we need exact vlen to identify - this is
    // a subregister extract at worst and thus won't require a vslidedown.
    // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
    // TODO: Extend for scalable subvector types
    if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
        SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
      const unsigned MinVLen = ST->getRealMinVLen();
      const unsigned MaxVLen = ST->getRealMaxVLen();
      if (MinVLen == MaxVLen &&
          SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
          SubLT.second.getSizeInBits() <= MinVLen)
        return TTI::TCC_Free;
    }

    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
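    // (The extra 1 below is intended to cover the scalar li that materializes
    // the mask bits.)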
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first *
               (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                            LT.second, CostKind));
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0

      return LT.first *
             (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                           RISCV::VMV_X_S, RISCV::VMV_V_X,
                                           RISCV::VMSNE_VI},
                                          LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
    if (LT.second.isFixedLengthVector() &&
        isInt<5>(LT.second.getVectorNumElements() - 1))
      Opcodes[1] = RISCV::VRSUB_VI;
    InstructionCost GatherCost =
        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    // A mask operation additionally requires an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy) && Factor != 2)
    return InstructionCost::getInvalid();

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    auto *VTy = cast<VectorType>(VecTy);
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isVector()) {
      auto *SubVecTy =
          VectorType::get(VTy->getElementType(),
                          VTy->getElementCount().divideCoefficientBy(Factor));

      if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
          TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        // FIXME: We use the memory op cost of the *legalized* type here,
        // because getMemoryOpCost returns a really expensive cost for types
        // like <6 x i8>, which show up when doing interleaves of Factor=3
        // etc. Should the memory op cost of these be cheaper?
        auto *LegalVTy = VectorType::get(VTy->getElementType(),
                                         LT.second.getVectorElementCount());
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // TODO: Return the cost of interleaved accesses for scalable vectors when
  // unable to convert to segment access instructions.
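  // Until then, such accesses are reported as invalid so that callers (e.g.
  // the vectorizers) avoid forming them.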
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();

  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
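  // Note that getEstimatedVLFor guesses the element count of scalable types
  // from getVScaleForTuning rather than the (unknown) runtime VLEN.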
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
       !isLegalStridedLoadStore(DataTy, Alignment)) ||
      (Opcode != Instruction::Load && Opcode != Instruction::Store))
    return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if (CostKind == TTI::TCK_CodeSize)
    return TTI::TCC_Basic;

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
      return LT.first;

    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::umin:
        Op = RISCV::VMINU_VV;
        break;
      case Intrinsic::umax:
        Op = RISCV::VMAXU_VV;
        break;
      case Intrinsic::smin:
        Op = RISCV::VMIN_VV;
        break;
      case Intrinsic::smax:
        Op = RISCV::VMAX_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  case Intrinsic::get_active_lane_mask: {
    if (ST->hasVInstructions()) {
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);

      // vid.v v8 // considered hoisted
      // vsaddu.vx v8, v8, a0
      // vmsltu.vx v0, v8, a1
      return LT.first *
             getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
                                     LT.second, CostKind);
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal types involves an `index' instruction plus
    // (LT.first - 1) vector adds.
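    // E.g. a stepvector split across two registers is modeled as one vid.v
    // plus one vadd.vx to offset the second half.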
    if (ST->hasVInstructions())
      return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
             (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
    return 1 + (LT.first - 1);
  }
  case Intrinsic::experimental_cttz_elts: {
    Type *ArgTy = ICA.getArgTypes()[0];
    EVT ArgType = TLI->getValueType(DL, ArgTy, true);
    if (getTLI()->shouldExpandCttzElements(ArgType))
      break;
    InstructionCost Cost = getRISCVInstructionCost(
        RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);

    // If zero_is_poison is false, then we will generate additional
    // cmp + select instructions to convert -1 to EVL.
    Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
    if (ICA.getArgs().size() > 1 &&
        cast<ConstantInt>(ICA.getArgs()[1])->isZero())
      Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
                                 CmpInst::ICMP_SLT, CostKind) +
              getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
                                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return Cost;
  }
  case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  // vp integer arithmetic ops.
  case Intrinsic::vp_add:
  case Intrinsic::vp_and:
  case Intrinsic::vp_ashr:
  case Intrinsic::vp_lshr:
  case Intrinsic::vp_mul:
  case Intrinsic::vp_or:
  case Intrinsic::vp_sdiv:
  case Intrinsic::vp_shl:
  case Intrinsic::vp_srem:
  case Intrinsic::vp_sub:
  case Intrinsic::vp_udiv:
  case Intrinsic::vp_urem:
  case Intrinsic::vp_xor:
  // vp float arithmetic ops.
  case Intrinsic::vp_fadd:
  case Intrinsic::vp_fsub:
  case Intrinsic::vp_fmul:
  case Intrinsic::vp_fdiv:
  case Intrinsic::vp_frem: {
    std::optional<unsigned> FOp =
        VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
    if (FOp)
      return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy);
        LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
  if (!IsVectorType)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
                     (Src->getScalarSizeInBits() <= ST->getELen()) &&
                     (Dst->getScalarSizeInBits() <= ST->getELen());

  // FIXME: Need to compute legalizing cost for illegal types.
  if (!IsTypeLegal)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                (int)Log2_32(Src->getScalarSizeInBits());
  switch (ISD) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    const unsigned SrcEltSize = Src->getScalarSizeInBits();
    if (SrcEltSize == 1) {
      // We do not use vsext/vzext to extend from a mask vector; instead we use
      // the following instructions to extend from a mask vector:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0
      return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
                                     DstLT.second, CostKind);
    }
    if ((PowDiff < 1) || (PowDiff > 3))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
    unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
    unsigned Op =
        (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
  }
  case ISD::TRUNCATE:
    if (Dst->getScalarSizeInBits() == 1) {
      // We do not use several vncvt to truncate to a mask vector, so we cannot
      // use PowDiff to calculate it. Instead we use the following instructions
      // to truncate to a mask vector:
      // vand.vi v8, v8, 1
      // vmsne.vi v0, v8, 0
      return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
                                     SrcLT.second, CostKind);
    }
    [[fallthrough]];
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND: {
    // Counts of narrow/widen instructions.
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();

    unsigned Op = (ISD == ISD::TRUNCATE)    ? RISCV::VNSRL_WI
                  : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
                                            : RISCV::VFNCVT_F_F_W;
    InstructionCost Cost = 0;
    for (; SrcEltSize != DstEltSize;) {
      MVT ElementMVT = (ISD == ISD::TRUNCATE)
                           ? MVT::getIntegerVT(DstEltSize)
                           : MVT::getFloatingPointVT(DstEltSize);
      MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
      DstEltSize =
          (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
      Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
    }
    return Cost;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
      // The cost of converting from or to a mask vector is different from
      // other cases, so we cannot use PowDiff to calculate it.
      // For mask vector to fp, we should use the following instructions:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0
      // vfcvt.f.x.v v8, v8

      // And for fp vector to mask, we use:
      // vfncvt.rtz.x.f.w v9, v8
      // vand.vi v8, v9, 1
      // vmsne.vi v0, v8, 0
      return 3;
    }
    if (std::abs(PowDiff) <= 1)
      return 1;
    // The backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8
    // i8), so it only needs two conversions.
    if (Src->isIntOrIntVectorTy())
      return 2;
    // Counts of narrow/widen instructions.
    return std::abs(PowDiff);
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1)) {
    // SelectionDAGBuilder does the following transforms:
    // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
    // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
    if (IID == Intrinsic::umax || IID == Intrinsic::smin)
      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
    else
      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
  }

  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
    SmallVector<unsigned, 3> Opcodes;
    InstructionCost ExtraCost = 0;
    switch (IID) {
    case Intrinsic::maximum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
                   RISCV::VFMV_F_S};
        // Cost of materializing the canonical NaN + branch
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DstTy->getScalarSizeInBits();
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;

    case Intrinsic::minimum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
                   RISCV::VFMV_F_S};
        // Cost of materializing the canonical NaN + branch
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;
    }
    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (IID) {
  default:
    llvm_unreachable("Unsupported intrinsic");
  case Intrinsic::smax:
    SplitOp = RISCV::VMAX_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::smin:
    SplitOp = RISCV::VMIN_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umax:
    SplitOp = RISCV::VMAXU_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umin:
    SplitOp = RISCV::VMINU_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::maxnum:
    SplitOp = RISCV::VFMAX_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
    break;
  case Intrinsic::minnum:
    SplitOp = RISCV::VFMIN_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) * getRISCVInstructionCost(
                                            SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  SmallVector<unsigned, 3> Opcodes;
  Type *ElementTy = Ty->getElementType();
  if (ElementTy->isIntegerTy(1)) {
    if (ISD == ISD::AND) {
      // Example sequences:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vmnot.m v8, v0
      // vcpop.m a0, v8
      // seqz a0, a0
      Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
      return (LT.first - 1) +
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
    } else {
      // Example sequences:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vcpop.m a0, v0
      // snez a0, a0
      Opcodes = {RISCV::VCPOP_M};
      return (LT.first - 1) +
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
    }
  }

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  if (TTI::requiresOrderedReduction(FMF)) {
    Opcodes.push_back(RISCV::VFMV_S_F);
    for (unsigned i = 0; i < LT.first.getValue(); i++)
      Opcodes.push_back(RISCV::VFREDOSUM_VS);
    Opcodes.push_back(RISCV::VFMV_F_S);
    return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  unsigned SplitOp;
  switch (ISD) {
  case ISD::ADD:
    SplitOp = RISCV::VADD_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
    break;
  case ISD::OR:
    SplitOp = RISCV::VOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::XOR:
    SplitOp = RISCV::VXOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::AND:
    SplitOp = RISCV::VAND_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
    break;
  case ISD::FADD:
    SplitOp = RISCV::VFADD_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) * getRISCVInstructionCost(
                                            SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.v.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory ops cost scale with the number of vector registers possibly
  // accessed by the instruction. Note that BasicTTI already handles the
  // LT.first term for us.
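  // For example, under this model an LMUL=4 access ends up roughly four times
  // as expensive as an LMUL=1 access, since getLMULCost scales with the size
  // of the register group.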
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first *
               getRISCVInstructionCost(
                   {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                   LT.second, CostKind);
      }
      // vselect and max/min are supported natively.
      return LT.first *
             getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
      return LT.first *
                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                         InterimVT, CostKind) +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                            LT.second, CostKind);
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * getRISCVInstructionCost(
                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
                          LT.second, CostKind);
  }

  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
      CmpInst::isIntPredicate(VecPred)) {
    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE,
    // provided they incur the same cost across all implementations.
    return LT.first *
           getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
  }

  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
      CmpInst::isFPPredicate(VecPred)) {

    // Use VMXOR_MM and VMXNOR_MM to generate an all-true/all-false mask.
    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
      return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);

    // If we do not support the input floating point vector type, use the base
    // implementation, which will calculate the cost as:
    //   ScalarizeCost + Num * Cost for fixed vectors,
    //   InvalidCost for scalable vectors.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Assuming vector fp compare and mask instructions are all the same cost
    // until a need arises to differentiate them.
    switch (VecPred) {
    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
      return LT.first * getRISCVInstructionCost(
                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
                            LT.second, CostKind);

    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
      return LT.first *
             getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
                                     LT.second, CostKind);

    case CmpInst::FCMP_OEQ: // vmfeq.vv
    case CmpInst::FCMP_OGT: // vmflt.vv
    case CmpInst::FCMP_OGE: // vmfle.vv
    case CmpInst::FCMP_OLT: // vmflt.vv
    case CmpInst::FCMP_OLE: // vmfle.vv
    case CmpInst::FCMP_UNE: // vmfne.vv
      return LT.first *
             getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
    default:
      break;
    }
  }

  // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
  // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which
  // will generate a conditional branch + mv. The cost of the scalar (icmp +
  // select) will be (0 + select instr cost).
  if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
      ValTy->isIntegerTy() && !I->user_empty()) {
    if (all_of(I->users(), [&](const User *U) {
          return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
                 U->getType()->isIntegerTy() &&
                 !isa<ConstantData>(U->getOperand(1)) &&
                 !isa<ConstantData>(U->getOperand(2));
        }))
      return 0;
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
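    // (Callers pass Index == -1U when the index is unknown or variable.)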

    // If the index is a known constant, the cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a load/store sequence through the
    // stack:
    //   ExtractElement cost: store vector to stack, load scalar;
    //   InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vectors, return the invalid cost.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement we also need an addi to compute index + 1, which is
  // used as the VL for the vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }
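
  // Illustrative cost sketch (assumes the defaults above): extracting element
  // 3 of a <8 x i32> is vslidedown + vmv.x.s, i.e. BaseCost + SlideCost = 2;
  // extracting element 0 needs only vmv.x.s, i.e. 1; inserting at a
  // non-constant index is addi + vslideup + vmv.s.x, i.e. 3.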

  // Extracting or inserting an i64 element on a target with XLEN=32 needs
  // more instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  unsigned Op;
  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
    Op = RISCV::VADD_VV;
    break;
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    Op = RISCV::VSLL_VV;
    break;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
    break;
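  // Illustrative mapping (the IR snippets are assumptions for exposition):
  //   %m = and <vscale x 8 x i1> %a, %b    ; costed as vmand.mm above
  //   %v = and <vscale x 8 x i32> %c, %d   ; costed as vand.vv
  // Either way, the per-instruction cost is scaled by LT.first below.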
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
    Op = RISCV::VMUL_VV;
    break;
  case ISD::SDIV:
  case ISD::UDIV:
    Op = RISCV::VDIV_VV;
    break;
  case ISD::SREM:
  case ISD::UREM:
    Op = RISCV::VREM_VV;
    break;
  case ISD::FADD:
  case ISD::FSUB:
    // TODO: Address FP16 with VFHMIN
    Op = RISCV::VFADD_VV;
    break;
  case ISD::FMUL:
    // TODO: Address FP16 with VFHMIN
    Op = RISCV::VFMUL_VV;
    break;
  case ISD::FDIV:
    Op = RISCV::VFDIV_VV;
    break;
  case ISD::FNEG:
    Op = RISCV::VFSGNJN_VV;
    break;
  default:
    // Assume all other instructions have the same cost until a need arises to
    // differentiate them.
    return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                                           Op1Info, Op2Info,
                                                           Args, CxtI);
  }

  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
  // We use BasicTTIImpl to calculate scalar costs, which assumes floating-point
  // ops are twice as expensive as integer ops. Do the same for vectors so that
  // scalar floating-point ops aren't cheaper than their vector equivalents.
  if (Ty->isFPOrFPVectorTy())
    InstrCost *= 2;
  return ConstantMatCost + LT.first * InstrCost;
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we only account for GEP instructions (although the
  // chain may also contain allocas, plain values, constants and/or constant
  // expressions, PHIs, bitcasts - anything that may be used as a pointer).
  // Typically, if Base is not a GEP instruction and all the pointers are
  // relative to the same base address, the rest are GEP instructions, PHIs,
  // bitcasts or constants. When the pointers share a base, each non-Base GEP
  // is costed as an ADD operation if any of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is the
  // sum of the costs of the individual GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, presume the base GEP is sitting around in a register
      // somewhere and check if we can fold the offset relative to it.
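      // Illustrative example (the constants are assumptions): for a
      // unit-stride chain of i32 accesses, the I-th pointer is Base + 4 * I.
      // A base register plus a small constant offset fits RISC-V's 12-bit
      // immediate addressing, so isLegalAddressingMode succeeds below and
      // the GEP is treated as free.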
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: All of the settings below would benefit from more tuning on
  // benchmarks and metrics.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, independent of the conditions
  // below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early exit as
  // it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;
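
  // Illustrative example (the loop body and its exact cost are assumptions):
  // a small body such as
  //   for (i = 0; i < n; ++i) a[i] += b[i];
  // typically scores well below 12 in TCK_SizeAndLatency terms, so the check
  // below sets UP.Force to amortise the taken-branch cost of the backedge.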

  // Forcing unrolling of small loops can be very useful because of the
  // taken-branch cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the same
  // routine as used in the LoopVectorizer. We should probably be accounting
  // for whether we actually have instructions with the right lane type, but
  // we don't have enough information to do that without some additional
  // plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give instruction count first
  // priority.
  // If we need to emit adds inside the loop to add up base registers, then
  // we need at least one extra temporary register.
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;
  return true;
}

bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
                                       const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target features are a subset of the caller's
  // target features.
  return (CallerBits & CalleeBits) == CalleeBits;
}
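
// Illustrative example (the feature sets are assumptions): a caller compiled
// with +m,+a,+v may inline a callee that requires only +m,+a, since the
// callee's feature bits are a subset of the caller's, but not a callee that
// additionally requires +zvfh.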