//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Result used for getMaximumVF query which is used exclusively by "
        "SLP vectorizer. Defaults to 1 which disables SLP."),
    cl::init(1), cl::Hidden);

InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
  // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is
  // implementation-defined.
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  unsigned Cost;
  if (VT.isScalableVector()) {
    unsigned LMul;
    bool Fractional;
    std::tie(LMul, Fractional) =
        RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
    if (Fractional)
      Cost = 1;
    else
      Cost = LMul;
  } else {
    Cost = VT.getSizeInBits() / ST->getRealMinVLen();
  }
  return std::max<unsigned>(Cost, 1);
}

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
                                    getST()->getFeatureBits());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
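  // For example, with ShAmt == 11 and Mask == 0x7f800 (a shifted mask with 11
  // trailing zeros) the AND immediate never has to be materialized; a
  // slli+srli pair produces the same result.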
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = countTrailingZeros(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Negated power of 2 is a shift and a negate.
    if (Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getMinSignedBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
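  // For instance, there is no RVV reduction instruction for multiply, so a
  // scalable llvm.vector.reduce.mul has to be expanded up front.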
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL = PowerOf2Floor(
      std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  unsigned Cost = 2; // vslidedown+vslideup.
  // TODO: Multiplying by LT.first implies this legalizes into multiple copies
  // of similar code, but I think we expand through memory.
  return Cost * LT.first * getLMULCost(LT.second);
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  if (isa<ScalableVectorType>(Tp)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
    switch (Kind) {
    default:
      // Fallthrough to generic handling.
      // TODO: Most of these cases will return getInvalid in generic code, and
      // must be implemented here.
      break;
    case TTI::SK_Broadcast: {
      return LT.first * 1;
    }
    case TTI::SK_Splice:
      return getSpliceCost(Tp, Index);
    case TTI::SK_Reverse:
      // Most of the cost here is producing the vrgather index register.
      // Example sequence:
      // csrr a0, vlenb
      // srli a0, a0, 3
      // addi a0, a0, -1
      // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
      // vid.v v9
      // vrsub.vx v10, v9, a0
      // vrgather.vv v9, v8, v10
      if (Tp->getElementType()->isIntegerTy(1))
        // A mask operation additionally requires an extend and a truncate.
        return LT.first * 9;
      return LT.first * 6;
    }
  }

  if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first * getLMULCost(LT.second) * 3;
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0
      return LT.first * getLMULCost(LT.second) * 6;
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first * getLMULCost(LT.second);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    // TODO: vrgather could be slower than vmv.v.x. It is
    // implementation-dependent.
    return LT.first * getLMULCost(LT.second);
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics.
// The costs in this table are simply instruction counts with the following
// adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::v2f32, 9},
    {Intrinsic::floor, MVT::v4f32, 9},
    {Intrinsic::floor, MVT::v8f32, 9},
    {Intrinsic::floor, MVT::v16f32, 9},
    {Intrinsic::floor, MVT::nxv1f32, 9},
    {Intrinsic::floor, MVT::nxv2f32, 9},
    {Intrinsic::floor, MVT::nxv4f32, 9},
    {Intrinsic::floor, MVT::nxv8f32, 9},
    {Intrinsic::floor, MVT::nxv16f32, 9},
    {Intrinsic::floor, MVT::v2f64, 9},
    {Intrinsic::floor, MVT::v4f64, 9},
    {Intrinsic::floor, MVT::v8f64, 9},
    {Intrinsic::floor, MVT::v16f64, 9},
    {Intrinsic::floor, MVT::nxv1f64, 9},
    {Intrinsic::floor, MVT::nxv2f64, 9},
    {Intrinsic::floor, MVT::nxv4f64, 9},
    {Intrinsic::floor, MVT::nxv8f64, 9},
    {Intrinsic::ceil, MVT::v2f32, 9},
    {Intrinsic::ceil, MVT::v4f32, 9},
    {Intrinsic::ceil, MVT::v8f32, 9},
    {Intrinsic::ceil, MVT::v16f32, 9},
    {Intrinsic::ceil, MVT::nxv1f32, 9},
    {Intrinsic::ceil, MVT::nxv2f32, 9},
    {Intrinsic::ceil, MVT::nxv4f32, 9},
    {Intrinsic::ceil, MVT::nxv8f32, 9},
    {Intrinsic::ceil, MVT::nxv16f32, 9},
    {Intrinsic::ceil, MVT::v2f64, 9},
    {Intrinsic::ceil, MVT::v4f64, 9},
    {Intrinsic::ceil, MVT::v8f64, 9},
    {Intrinsic::ceil, MVT::v16f64, 9},
    {Intrinsic::ceil, MVT::nxv1f64, 9},
    {Intrinsic::ceil, MVT::nxv2f64, 9},
    {Intrinsic::ceil, MVT::nxv4f64, 9},
    {Intrinsic::ceil, MVT::nxv8f64, 9},
    {Intrinsic::trunc, MVT::v2f32, 7},
    {Intrinsic::trunc, MVT::v4f32, 7},
    {Intrinsic::trunc, MVT::v8f32, 7},
    {Intrinsic::trunc, MVT::v16f32, 7},
    {Intrinsic::trunc, MVT::nxv1f32, 7},
    {Intrinsic::trunc, MVT::nxv2f32, 7},
    {Intrinsic::trunc, MVT::nxv4f32, 7},
    {Intrinsic::trunc, MVT::nxv8f32, 7},
    {Intrinsic::trunc, MVT::nxv16f32, 7},
    {Intrinsic::trunc, MVT::v2f64, 7},
    {Intrinsic::trunc, MVT::v4f64, 7},
    {Intrinsic::trunc, MVT::v8f64, 7},
    {Intrinsic::trunc, MVT::v16f64, 7},
    {Intrinsic::trunc, MVT::nxv1f64, 7},
    {Intrinsic::trunc, MVT::nxv2f64, 7},
    {Intrinsic::trunc, MVT::nxv4f64, 7},
    {Intrinsic::trunc, MVT::nxv8f64, 7},
    {Intrinsic::round, MVT::v2f32, 9},
    {Intrinsic::round, MVT::v4f32, 9},
    {Intrinsic::round, MVT::v8f32, 9},
    {Intrinsic::round, MVT::v16f32, 9},
    {Intrinsic::round, MVT::nxv1f32, 9},
    {Intrinsic::round, MVT::nxv2f32, 9},
    {Intrinsic::round, MVT::nxv4f32, 9},
    {Intrinsic::round, MVT::nxv8f32, 9},
    {Intrinsic::round, MVT::nxv16f32, 9},
    {Intrinsic::round, MVT::v2f64, 9},
    {Intrinsic::round, MVT::v4f64, 9},
    {Intrinsic::round, MVT::v8f64, 9},
    {Intrinsic::round, MVT::v16f64, 9},
    {Intrinsic::round, MVT::nxv1f64, 9},
    {Intrinsic::round, MVT::nxv2f64, 9},
    {Intrinsic::round, MVT::nxv4f64, 9},
    {Intrinsic::round, MVT::nxv8f64, 9},
    {Intrinsic::roundeven, MVT::v2f32, 9},
    {Intrinsic::roundeven, MVT::v4f32, 9},
    {Intrinsic::roundeven, MVT::v8f32, 9},
    {Intrinsic::roundeven, MVT::v16f32, 9},
    {Intrinsic::roundeven, MVT::nxv1f32, 9},
    {Intrinsic::roundeven, MVT::nxv2f32, 9},
    {Intrinsic::roundeven, MVT::nxv4f32, 9},
    {Intrinsic::roundeven, MVT::nxv8f32, 9},
    {Intrinsic::roundeven, MVT::nxv16f32, 9},
    {Intrinsic::roundeven, MVT::v2f64, 9},
    {Intrinsic::roundeven, MVT::v4f64, 9},
    {Intrinsic::roundeven, MVT::v8f64, 9},
    {Intrinsic::roundeven, MVT::v16f64, 9},
    {Intrinsic::roundeven, MVT::nxv1f64, 9},
    {Intrinsic::roundeven, MVT::nxv2f64, 9},
    {Intrinsic::roundeven, MVT::nxv4f64, 9},
    {Intrinsic::roundeven, MVT::nxv8f64, 9},
    {Intrinsic::bswap, MVT::v2i16, 3},
    {Intrinsic::bswap, MVT::v4i16, 3},
    {Intrinsic::bswap, MVT::v8i16, 3},
    {Intrinsic::bswap, MVT::v16i16, 3},
    {Intrinsic::bswap, MVT::nxv1i16, 3},
    {Intrinsic::bswap, MVT::nxv2i16, 3},
    {Intrinsic::bswap, MVT::nxv4i16, 3},
    {Intrinsic::bswap, MVT::nxv8i16, 3},
    {Intrinsic::bswap, MVT::nxv16i16, 3},
    {Intrinsic::bswap, MVT::v2i32, 12},
    {Intrinsic::bswap, MVT::v4i32, 12},
    {Intrinsic::bswap, MVT::v8i32, 12},
    {Intrinsic::bswap, MVT::v16i32, 12},
    {Intrinsic::bswap, MVT::nxv1i32, 12},
    {Intrinsic::bswap, MVT::nxv2i32, 12},
    {Intrinsic::bswap, MVT::nxv4i32, 12},
    {Intrinsic::bswap, MVT::nxv8i32, 12},
    {Intrinsic::bswap, MVT::nxv16i32, 12},
    {Intrinsic::bswap, MVT::v2i64, 31},
    {Intrinsic::bswap, MVT::v4i64, 31},
    {Intrinsic::bswap, MVT::v8i64, 31},
    {Intrinsic::bswap, MVT::v16i64, 31},
    {Intrinsic::bswap, MVT::nxv1i64, 31},
    {Intrinsic::bswap, MVT::nxv2i64, 31},
    {Intrinsic::bswap, MVT::nxv4i64, 31},
    {Intrinsic::bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_bswap, MVT::v2i16, 3},
    {Intrinsic::vp_bswap, MVT::v4i16, 3},
    {Intrinsic::vp_bswap, MVT::v8i16, 3},
    {Intrinsic::vp_bswap, MVT::v16i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
    {Intrinsic::vp_bswap, MVT::v2i32, 12},
    {Intrinsic::vp_bswap, MVT::v4i32, 12},
    {Intrinsic::vp_bswap, MVT::v8i32, 12},
    {Intrinsic::vp_bswap, MVT::v16i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
    {Intrinsic::vp_bswap, MVT::v2i64, 31},
    {Intrinsic::vp_bswap, MVT::v4i64, 31},
    {Intrinsic::vp_bswap, MVT::v8i64, 31},
    {Intrinsic::vp_bswap, MVT::v16i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_fshl, MVT::v2i8, 7},
    {Intrinsic::vp_fshl, MVT::v4i8, 7},
    {Intrinsic::vp_fshl, MVT::v8i8, 7},
    {Intrinsic::vp_fshl, MVT::v16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshl, MVT::v2i16, 7},
    {Intrinsic::vp_fshl, MVT::v4i16, 7},
    {Intrinsic::vp_fshl, MVT::v8i16, 7},
    {Intrinsic::vp_fshl, MVT::v16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshl, MVT::v2i32, 7},
    {Intrinsic::vp_fshl, MVT::v4i32, 7},
    {Intrinsic::vp_fshl, MVT::v8i32, 7},
    {Intrinsic::vp_fshl, MVT::v16i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshl, MVT::v2i64, 7},
    {Intrinsic::vp_fshl, MVT::v4i64, 7},
    {Intrinsic::vp_fshl, MVT::v8i64, 7},
    {Intrinsic::vp_fshl, MVT::v16i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
    {Intrinsic::vp_fshr, MVT::v2i8, 7},
    {Intrinsic::vp_fshr, MVT::v4i8, 7},
    {Intrinsic::vp_fshr, MVT::v8i8, 7},
    {Intrinsic::vp_fshr, MVT::v16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshr, MVT::v2i16, 7},
    {Intrinsic::vp_fshr, MVT::v4i16, 7},
    {Intrinsic::vp_fshr, MVT::v8i16, 7},
    {Intrinsic::vp_fshr, MVT::v16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshr, MVT::v2i32, 7},
    {Intrinsic::vp_fshr, MVT::v4i32, 7},
    {Intrinsic::vp_fshr, MVT::v8i32, 7},
    {Intrinsic::vp_fshr, MVT::v16i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshr, MVT::v2i64, 7},
    {Intrinsic::vp_fshr, MVT::v4i64, 7},
    {Intrinsic::vp_fshr, MVT::v8i64, 7},
    {Intrinsic::vp_fshr, MVT::v16i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
    {Intrinsic::bitreverse, MVT::v2i8, 17},
    {Intrinsic::bitreverse, MVT::v4i8, 17},
    {Intrinsic::bitreverse, MVT::v8i8, 17},
    {Intrinsic::bitreverse, MVT::v16i8, 17},
    {Intrinsic::bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::bitreverse, MVT::v2i16, 24},
    {Intrinsic::bitreverse, MVT::v4i16, 24},
    {Intrinsic::bitreverse, MVT::v8i16, 24},
    {Intrinsic::bitreverse, MVT::v16i16, 24},
    {Intrinsic::bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::bitreverse, MVT::v2i32, 33},
    {Intrinsic::bitreverse, MVT::v4i32, 33},
    {Intrinsic::bitreverse, MVT::v8i32, 33},
    {Intrinsic::bitreverse, MVT::v16i32, 33},
    {Intrinsic::bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::bitreverse, MVT::v2i64, 52},
    {Intrinsic::bitreverse, MVT::v4i64, 52},
    {Intrinsic::bitreverse, MVT::v8i64, 52},
    {Intrinsic::bitreverse, MVT::v16i64, 52},
    {Intrinsic::bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::ctpop, MVT::v2i8, 12},
    {Intrinsic::ctpop, MVT::v4i8, 12},
    {Intrinsic::ctpop, MVT::v8i8, 12},
    {Intrinsic::ctpop, MVT::v16i8, 12},
    {Intrinsic::ctpop, MVT::nxv1i8, 12},
    {Intrinsic::ctpop, MVT::nxv2i8, 12},
    {Intrinsic::ctpop, MVT::nxv4i8, 12},
    {Intrinsic::ctpop, MVT::nxv8i8, 12},
    {Intrinsic::ctpop, MVT::nxv16i8, 12},
    {Intrinsic::ctpop, MVT::v2i16, 19},
    {Intrinsic::ctpop, MVT::v4i16, 19},
    {Intrinsic::ctpop, MVT::v8i16, 19},
    {Intrinsic::ctpop, MVT::v16i16, 19},
    {Intrinsic::ctpop, MVT::nxv1i16, 19},
    {Intrinsic::ctpop, MVT::nxv2i16, 19},
    {Intrinsic::ctpop, MVT::nxv4i16, 19},
    {Intrinsic::ctpop, MVT::nxv8i16, 19},
    {Intrinsic::ctpop, MVT::nxv16i16, 19},
    {Intrinsic::ctpop, MVT::v2i32, 20},
    {Intrinsic::ctpop, MVT::v4i32, 20},
    {Intrinsic::ctpop, MVT::v8i32, 20},
    {Intrinsic::ctpop, MVT::v16i32, 20},
    {Intrinsic::ctpop, MVT::nxv1i32, 20},
    {Intrinsic::ctpop, MVT::nxv2i32, 20},
    {Intrinsic::ctpop, MVT::nxv4i32, 20},
    {Intrinsic::ctpop, MVT::nxv8i32, 20},
    {Intrinsic::ctpop, MVT::nxv16i32, 20},
    {Intrinsic::ctpop, MVT::v2i64, 21},
    {Intrinsic::ctpop, MVT::v4i64, 21},
    {Intrinsic::ctpop, MVT::v8i64, 21},
    {Intrinsic::ctpop, MVT::v16i64, 21},
    {Intrinsic::ctpop, MVT::nxv1i64, 21},
    {Intrinsic::ctpop, MVT::nxv2i64, 21},
    {Intrinsic::ctpop, MVT::nxv4i64, 21},
    {Intrinsic::ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v2i8, 12},
    {Intrinsic::vp_ctpop, MVT::v4i8, 12},
    {Intrinsic::vp_ctpop, MVT::v8i8, 12},
    {Intrinsic::vp_ctpop, MVT::v16i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
    {Intrinsic::vp_ctpop, MVT::v2i16, 19},
    {Intrinsic::vp_ctpop, MVT::v4i16, 19},
    {Intrinsic::vp_ctpop, MVT::v8i16, 19},
    {Intrinsic::vp_ctpop, MVT::v16i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
    {Intrinsic::vp_ctpop, MVT::v2i32, 20},
    {Intrinsic::vp_ctpop, MVT::v4i32, 20},
    {Intrinsic::vp_ctpop, MVT::v8i32, 20},
    {Intrinsic::vp_ctpop, MVT::v16i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
    {Intrinsic::vp_ctpop, MVT::v2i64, 21},
    {Intrinsic::vp_ctpop, MVT::v4i64, 21},
    {Intrinsic::vp_ctpop, MVT::v8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v16i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctlz, MVT::v2i8, 19},
    {Intrinsic::vp_ctlz, MVT::v4i8, 19},
    {Intrinsic::vp_ctlz, MVT::v8i8, 19},
    {Intrinsic::vp_ctlz, MVT::v16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
    {Intrinsic::vp_ctlz, MVT::v2i16, 28},
    {Intrinsic::vp_ctlz, MVT::v4i16, 28},
    {Intrinsic::vp_ctlz, MVT::v8i16, 28},
    {Intrinsic::vp_ctlz, MVT::v16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
    {Intrinsic::vp_ctlz, MVT::v2i32, 31},
    {Intrinsic::vp_ctlz, MVT::v4i32, 31},
    {Intrinsic::vp_ctlz, MVT::v8i32, 31},
    {Intrinsic::vp_ctlz, MVT::v16i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
    {Intrinsic::vp_ctlz, MVT::v2i64, 35},
    {Intrinsic::vp_ctlz, MVT::v4i64, 35},
    {Intrinsic::vp_ctlz, MVT::v8i64, 35},
    {Intrinsic::vp_ctlz, MVT::v16i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
    {Intrinsic::vp_cttz, MVT::v2i8, 16},
    {Intrinsic::vp_cttz, MVT::v4i8, 16},
    {Intrinsic::vp_cttz, MVT::v8i8, 16},
    {Intrinsic::vp_cttz, MVT::v16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
    {Intrinsic::vp_cttz, MVT::v2i16, 23},
    {Intrinsic::vp_cttz, MVT::v4i16, 23},
    {Intrinsic::vp_cttz, MVT::v8i16, 23},
    {Intrinsic::vp_cttz, MVT::v16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
    {Intrinsic::vp_cttz, MVT::v2i32, 24},
    {Intrinsic::vp_cttz, MVT::v4i32, 24},
    {Intrinsic::vp_cttz, MVT::v8i32, 24},
    {Intrinsic::vp_cttz, MVT::v16i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
    {Intrinsic::vp_cttz, MVT::v2i64, 25},
    {Intrinsic::vp_cttz, MVT::v4i64, 25},
    {Intrinsic::vp_cttz, MVT::v8i64, 25},
    {Intrinsic::vp_cttz, MVT::v16i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more fflags read and one more write than vp_rint.
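    // (frflags/fsflags bracket the conversion so the inexact flag is left
    // untouched.)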
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    auto LT = getTypeLegalizationCost(RetTy);
    if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                            ICA.getID(), LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELEN() ||
        Dst->getScalarSizeInBits() > ST->getELEN())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from a mask vector.
        // Instead we use the following instructions to extend from a mask
        // vector:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt to truncate to a mask vector, so we
        // cannot use PowDiff to calculate it.
        // Instead we use the following instructions to truncate to a mask
        // vector:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of converting from or to a mask vector is different from
        // other cases, so we cannot use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
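      // e.g. nxv2i8 -> nxv2f64 becomes a vsext.vf8 followed by a vfcvt.f.x.v.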
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
    // cost 2, but we don't have enough info here so we slightly over cost.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv and one rvv reduction instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv and one rvv reduction instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  // Add a cost of address generation + the cost of the vector load. The
  // address is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                       CostKind, OpInfo, I);
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
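      // e.g. a single vmerge.vvm selects between the two source vectors under
      // the mask in v0.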
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Supported natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
    // Supported natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector())
    return 0;

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement we need to add 1 to the index to form the VL input of
  // vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We could extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Mask vector extract/insert element is different from the normal case.
  if (Val->getScalarSizeInBits() == 1) {
    // For extractelement, we need the following instructions:
    // vmv.v.i v8, 0
    // vmerge.vim v8, v8, 1, v0
    // vsetivli zero, 1, e8, m2, ta, mu (not count)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8

    // For insertelement, we need the following instructions:
    // vsetvli a2, zero, e8, m1, ta, mu (not count)
    // vmv.s.x v8, a0
    // vmv.v.i v9, 0
    // vmerge.vim v9, v9, 1, v0
    // addi a0, a1, 1
    // vsetvli zero, a0, e8, m1, tu, mu (not count)
    // vslideup.vx v9, v8, a1
    // vsetvli a0, zero, e8, m1, ta, mu (not count)
    // vand.vi v8, v9, 1
    // vmsne.vi v0, v8, 0

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
  }
  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not count)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not count)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not count)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    // Add a cost of address generation + the cost of the vector load. The
    // address is expected to be a PC relative offset to a constant pool entry
    // using auipc/addi.
    return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                               /*AddressSpace=*/0, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  // would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  // This interface is currently only used by SLP. Returning 1 (which is the
  // default value for SLPMaxVF) disables SLP. We currently have a cost modeling
  // problem w/ constant materialization which causes SLP to perform majorly
  // unprofitable transformations.
  // TODO: Figure out constant materialization cost modeling and remove.
  return SLPMaxVF;
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // RISC-V specific: the number of instructions has first priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}