//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKind
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
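      // The getSignificantBits() check guards the getSExtValue() call below;
      // isLegalAddImmediate then checks the signed 12-bit immediate range.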
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves and
        // deinterleaves of 2 vectors can be lowered into the following
        // sequences
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Providing an accurate cost only for splits where
      // the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first *
               (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
                getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                        LT.second, CostKind));
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0

      return LT.first *
             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
              getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                       RISCV::VMV_X_S, RISCV::VMV_V_X,
                                       RISCV::VMSNE_VI},
                                      LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
    InstructionCost GatherCost =
        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
    // A mask operation additionally requires an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
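    // Total: length materialization + vid/vrsub/vrgather sequence + any i1
    // extend/truncate, scaled by the number of legalized register groups.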
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load and store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // getMemoryOpCost for the original type returns a really expensive cost
      // for types like <6 x i8>, which show up when doing interleaves of
      // Factor=3 etc. Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
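// Entries are keyed on the intrinsic ID and the legalized element type; the
// lookup in getIntrinsicInstrCost scales the listed cost by LT.first to
// account for type splitting.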
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
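    // When the scalar form is custom lowered, charge roughly 8 instructions
    // for the expanded sequence.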
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy);
        LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELen() ||
        Dst->getScalarSizeInBits() > ST->getELen())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from a mask vector.
        // Instead we use the following instructions to extend from a mask
        // vector:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt instructions to truncate to a mask
        // vector, so we cannot use PowDiff to calculate the cost.
        // Instead we use the following instructions to truncate to a mask
        // vector:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of converting from or to a mask vector differs from the
        // other cases, so we cannot use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
    // cost 2, but we don't have enough info here so we slightly over cost.
    return (LT.first - 1) + 3;

  // IR Reduction is composed of two vmv and one rvv reduction instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // IR Reduction is composed of two vmv and one rvv reduction instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory ops cost scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Supported natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // implementation, which computes the cost as:
    // ScalarizeCost + Num * Cost for a fixed vector,
    // InvalidCost for a scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
    // Supported natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with non-constant index is very costly when
    // scalarized; estimate cost of loads/stores sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
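  // That is, zero-extend the i1 vector to an i8 vector, do the extract/insert
  // on the wider type, and (for inserts) truncate the result back to i1.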
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost
          = getCastInstrCost(Instruction::ZExt, WideTy, Val,
                             TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost
          = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost
        = getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost
        = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost
        = getCastInstrCost(Instruction::Trunc, Val, WideTy,
                           TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement, the index must be incremented by 1 to form the input
  // of vslideup, which takes an extra addi.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We could extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not count)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not count)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not count)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in scalar register
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only. (Other
  // things can appear here, such as an alloca instruction, a value, constants
  // and/or constant expressions, PHIs, bitcasts: whatever is allowed to be
  // used as a pointer.) Typically, if Base is not a GEP instruction and all
  // the pointers are relative to the same base address, all the rest are
  // either GEP instructions, PHIs, bitcasts or constants. When we have the
  // same base, we just calculate the cost of each non-Base GEP as an ADD
  // operation if any of its indices is a non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics, with changes applied as
  // needed to all the settings below, would help performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
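  // The cost above is accumulated with TCK_SizeAndLatency, so the threshold
  // of 12 is in those units.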
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific behavior here is that instruction count gets first
  // priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}