//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
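// Illustrative example (an assumption for exposition, with XLEN=64): for
//   (and (shl x, 11), 0x7f800)
// the mask 0x7f800 is a contiguous run of bits starting at bit 11, so the
// pair can be selected as (srli (slli x, 56), 45) and the AND immediate never
// has to be materialized.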
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative; in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check that the immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV.
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
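  // An illustrative (assumed) sequence for a vector constant would be roughly:
  //   auipc a0, <hi offset to .LCPI0_0>
  //   addi  a0, a0, <lo offset>
  //   vle32.v v8, (a0)
  // i.e. two address-generation instructions plus the load costed below.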
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the element size is smaller than ELEN, then interleave and
        // deinterleave shuffles of two vectors can be lowered into the
        // following sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. We provide an accurate cost only for splits
      // where the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        // Return the accumulated cost of all per-register sub-shuffles (the
        // return must be outside the loop so every register is counted).
        return Cost;
      }
      break;
    }
    }
  };

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first *
               (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
                getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                        LT.second, CostKind));
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0
      return LT.first *
             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
              getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                       RISCV::VMV_X_S, RISCV::VMV_V_X,
                                       RISCV::VMSNE_VI},
                                      LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown + vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple
    // copies of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5-bit immediate field; otherwise an li suffices.
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    // FIXME: replace the constant `2` below with the cost of {VID_V, VRSUB_VX}.
    InstructionCost GatherCost =
        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
    // Mask operations additionally require an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load and store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat the whole thing as if it's just
  // one (legal) memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // its getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2.
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like:
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  // TODO: add more intrinsics.
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more fflags read and one more fflags write than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy); LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if the element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELen() ||
        Dst->getScalarSizeInBits() > ST->getELen())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from a mask vector.
        // Instead we use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt instructions to truncate to a mask
        // vector, so we cannot use PowDiff to calculate the cost.
        // Instead we use the following instructions:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of converting from or to a mask vector is different from
        // the other cases, so we cannot use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough info here, so we slightly overestimate.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv instructions and one rvv reduction
  // instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll.
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv instructions and one rvv reduction
  // instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if the scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory op costs scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if the scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Supported natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // implementation, which calculates the cost as:
    //   ScalarizeCost + Num * Cost for a fixed vector,
    //   InvalidCost for a scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
    // Supported natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a load/store sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vectors.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
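  // For instance (illustrative summary of the code below): an i1 extract is
  // costed as a zero-extend of the whole mask vector to an e8 vector plus an
  // e8 extract, and an i1 insert additionally pays for truncating the widened
  // vector back to i1.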
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement, we additionally need an addi to add 1 to the index that
  // feeds vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting i64 on a target with XLEN=32 needs more instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only (although
  // here can also come an alloca instruction, a value, constants and/or
  // constant expressions, PHIs, bitcasts ... whatever is allowed to be used as
  // a pointer). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, all the rest are
  // either GEP instructions, PHIs, bitcasts or constants. When we have the
  // same base, we just calculate the cost of each non-Base GEP as an ADD
  // operation if any of its indices is a non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
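      // For example (illustrative): with AccessTy = i32 and a unit-stride
      // chain p, p+4, p+8, ..., each non-base access can typically be folded
      // into the load/store as a reg+imm address, so no extra ADD is costed.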
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics, with changes as needed, would
  // apply to all of the settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in the LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // RISC-V specific here: the number of instructions has first priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}