1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "RISCVTargetTransformInfo.h" 10 #include "MCTargetDesc/RISCVMatInt.h" 11 #include "llvm/ADT/STLExtras.h" 12 #include "llvm/Analysis/TargetTransformInfo.h" 13 #include "llvm/CodeGen/BasicTTIImpl.h" 14 #include "llvm/CodeGen/CostTable.h" 15 #include "llvm/CodeGen/TargetLowering.h" 16 #include "llvm/CodeGen/ValueTypes.h" 17 #include "llvm/IR/Instructions.h" 18 #include "llvm/IR/PatternMatch.h" 19 #include <cmath> 20 #include <optional> 21 using namespace llvm; 22 using namespace llvm::PatternMatch; 23 24 #define DEBUG_TYPE "riscvtti" 25 26 static cl::opt<unsigned> RVVRegisterWidthLMUL( 27 "riscv-v-register-bit-width-lmul", 28 cl::desc( 29 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " 30 "by autovectorized code. 
Fractional LMULs are not supported."), 31 cl::init(2), cl::Hidden); 32 33 static cl::opt<unsigned> SLPMaxVF( 34 "riscv-v-slp-max-vf", 35 cl::desc( 36 "Overrides result used for getMaximumVF query which is used " 37 "exclusively by SLP vectorizer."), 38 cl::Hidden); 39 40 static cl::opt<unsigned> 41 RVVMinTripCount("riscv-v-min-trip-count", 42 cl::desc("Set the lower bound of a trip count to decide on " 43 "vectorization while tail-folding."), 44 cl::init(5), cl::Hidden); 45 46 InstructionCost 47 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, 48 TTI::TargetCostKind CostKind) const { 49 // Check if the type is valid for all CostKind 50 if (!VT.isVector()) 51 return InstructionCost::getInvalid(); 52 size_t NumInstr = OpCodes.size(); 53 if (CostKind == TTI::TCK_CodeSize) 54 return NumInstr; 55 InstructionCost LMULCost = TLI->getLMULCost(VT); 56 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) 57 return LMULCost * NumInstr; 58 InstructionCost Cost = 0; 59 for (auto Op : OpCodes) { 60 switch (Op) { 61 case RISCV::VRGATHER_VI: 62 Cost += TLI->getVRGatherVICost(VT); 63 break; 64 case RISCV::VRGATHER_VV: 65 Cost += TLI->getVRGatherVVCost(VT); 66 break; 67 case RISCV::VSLIDEUP_VI: 68 case RISCV::VSLIDEDOWN_VI: 69 Cost += TLI->getVSlideVICost(VT); 70 break; 71 case RISCV::VSLIDEUP_VX: 72 case RISCV::VSLIDEDOWN_VX: 73 Cost += TLI->getVSlideVXCost(VT); 74 break; 75 case RISCV::VREDMAX_VS: 76 case RISCV::VREDMIN_VS: 77 case RISCV::VREDMAXU_VS: 78 case RISCV::VREDMINU_VS: 79 case RISCV::VREDSUM_VS: 80 case RISCV::VREDAND_VS: 81 case RISCV::VREDOR_VS: 82 case RISCV::VREDXOR_VS: 83 case RISCV::VFREDMAX_VS: 84 case RISCV::VFREDMIN_VS: 85 case RISCV::VFREDUSUM_VS: { 86 unsigned VL = VT.getVectorMinNumElements(); 87 if (!VT.isFixedLengthVector()) 88 VL *= *getVScaleForTuning(); 89 Cost += Log2_32_Ceil(VL); 90 break; 91 } 92 case RISCV::VFREDOSUM_VS: { 93 unsigned VL = VT.getVectorMinNumElements(); 94 if 
(!VT.isFixedLengthVector()) 95 VL *= *getVScaleForTuning(); 96 Cost += VL; 97 break; 98 } 99 case RISCV::VMV_X_S: 100 case RISCV::VMV_S_X: 101 case RISCV::VFMV_F_S: 102 case RISCV::VFMV_S_F: 103 case RISCV::VMOR_MM: 104 case RISCV::VMXOR_MM: 105 case RISCV::VMAND_MM: 106 case RISCV::VMANDN_MM: 107 case RISCV::VMNAND_MM: 108 case RISCV::VCPOP_M: 109 case RISCV::VFIRST_M: 110 Cost += 1; 111 break; 112 default: 113 Cost += LMULCost; 114 } 115 } 116 return Cost; 117 } 118 119 static InstructionCost getIntImmCostImpl(const DataLayout &DL, 120 const RISCVSubtarget *ST, 121 const APInt &Imm, Type *Ty, 122 TTI::TargetCostKind CostKind, 123 bool FreeZeroes) { 124 assert(Ty->isIntegerTy() && 125 "getIntImmCost can only estimate cost of materialising integers"); 126 127 // We have a Zero register, so 0 is always free. 128 if (Imm == 0) 129 return TTI::TCC_Free; 130 131 // Otherwise, we check how many instructions it will take to materialise. 132 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST, 133 /*CompressionCost=*/false, FreeZeroes); 134 } 135 136 InstructionCost 137 RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 138 TTI::TargetCostKind CostKind) const { 139 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false); 140 } 141 142 // Look for patterns of shift followed by AND that can be turned into a pair of 143 // shifts. We won't need to materialize an immediate for the AND so these can 144 // be considered free. 
145 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { 146 uint64_t Mask = Imm.getZExtValue(); 147 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0)); 148 if (!BO || !BO->hasOneUse()) 149 return false; 150 151 if (BO->getOpcode() != Instruction::Shl) 152 return false; 153 154 if (!isa<ConstantInt>(BO->getOperand(1))) 155 return false; 156 157 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue(); 158 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1 159 // is a mask shifted by c2 bits with c3 leading zeros. 160 if (isShiftedMask_64(Mask)) { 161 unsigned Trailing = llvm::countr_zero(Mask); 162 if (ShAmt == Trailing) 163 return true; 164 } 165 166 return false; 167 } 168 169 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 170 const APInt &Imm, Type *Ty, 171 TTI::TargetCostKind CostKind, 172 Instruction *Inst) const { 173 assert(Ty->isIntegerTy() && 174 "getIntImmCost can only estimate cost of materialising integers"); 175 176 // We have a Zero register, so 0 is always free. 177 if (Imm == 0) 178 return TTI::TCC_Free; 179 180 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are 181 // commutative, in others the immediate comes from a specific argument index. 182 bool Takes12BitImm = false; 183 unsigned ImmArgIdx = ~0U; 184 185 switch (Opcode) { 186 case Instruction::GetElementPtr: 187 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will 188 // split up large offsets in GEP into better parts than ConstantHoisting 189 // can. 190 return TTI::TCC_Free; 191 case Instruction::Store: { 192 // Use the materialization cost regardless of if it's the address or the 193 // value that is constant, except for if the store is misaligned and 194 // misaligned accesses are not legal (experience shows constant hoisting 195 // can sometimes be harmful in such cases). 
196 if (Idx == 1 || !Inst) 197 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, 198 /*FreeZeroes=*/true); 199 200 StoreInst *ST = cast<StoreInst>(Inst); 201 if (!getTLI()->allowsMemoryAccessForAlignment( 202 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty), 203 ST->getPointerAddressSpace(), ST->getAlign())) 204 return TTI::TCC_Free; 205 206 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, 207 /*FreeZeroes=*/true); 208 } 209 case Instruction::Load: 210 // If the address is a constant, use the materialization cost. 211 return getIntImmCost(Imm, Ty, CostKind); 212 case Instruction::And: 213 // zext.h 214 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb()) 215 return TTI::TCC_Free; 216 // zext.w 217 if (Imm == UINT64_C(0xffffffff) && 218 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32())) 219 return TTI::TCC_Free; 220 // bclri 221 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2()) 222 return TTI::TCC_Free; 223 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && 224 canUseShiftPair(Inst, Imm)) 225 return TTI::TCC_Free; 226 Takes12BitImm = true; 227 break; 228 case Instruction::Add: 229 Takes12BitImm = true; 230 break; 231 case Instruction::Or: 232 case Instruction::Xor: 233 // bseti/binvi 234 if (ST->hasStdExtZbs() && Imm.isPowerOf2()) 235 return TTI::TCC_Free; 236 Takes12BitImm = true; 237 break; 238 case Instruction::Mul: 239 // Power of 2 is a shift. Negated power of 2 is a shift and a negate. 240 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2()) 241 return TTI::TCC_Free; 242 // One more or less than a power of 2 can use SLLI+ADD/SUB. 243 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2()) 244 return TTI::TCC_Free; 245 // FIXME: There is no MULI instruction. 
246 Takes12BitImm = true; 247 break; 248 case Instruction::Sub: 249 case Instruction::Shl: 250 case Instruction::LShr: 251 case Instruction::AShr: 252 Takes12BitImm = true; 253 ImmArgIdx = 1; 254 break; 255 default: 256 break; 257 } 258 259 if (Takes12BitImm) { 260 // Check immediate is the correct argument... 261 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) { 262 // ... and fits into the 12-bit immediate. 263 if (Imm.getSignificantBits() <= 64 && 264 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) { 265 return TTI::TCC_Free; 266 } 267 } 268 269 // Otherwise, use the full materialisation cost. 270 return getIntImmCost(Imm, Ty, CostKind); 271 } 272 273 // By default, prevent hoisting. 274 return TTI::TCC_Free; 275 } 276 277 InstructionCost 278 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 279 const APInt &Imm, Type *Ty, 280 TTI::TargetCostKind CostKind) const { 281 // Prevent hoisting in unknown cases. 282 return TTI::TCC_Free; 283 } 284 285 bool RISCVTTIImpl::hasActiveVectorLength() const { 286 return ST->hasVInstructions(); 287 } 288 289 TargetTransformInfo::PopcntSupportKind 290 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const { 291 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 292 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit()) 293 ? TTI::PSK_FastHardware 294 : TTI::PSK_Software; 295 } 296 297 InstructionCost RISCVTTIImpl::getPartialReductionCost( 298 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, 299 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, 300 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp, 301 TTI::TargetCostKind CostKind) const { 302 303 // zve32x is broken for partial_reduce_umla, but let's make sure we 304 // don't generate them. 
305 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 || 306 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul || 307 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) || 308 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4)) 309 return InstructionCost::getInvalid(); 310 311 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4)); 312 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 313 // Note: Asuming all vqdot* variants are equal cost 314 return LT.first * 315 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind); 316 } 317 318 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { 319 // Currently, the ExpandReductions pass can't expand scalable-vector 320 // reductions, but we still request expansion as RVV doesn't support certain 321 // reductions and the SelectionDAG can't legalize them either. 322 switch (II->getIntrinsicID()) { 323 default: 324 return false; 325 // These reductions have no equivalent in RVV 326 case Intrinsic::vector_reduce_mul: 327 case Intrinsic::vector_reduce_fmul: 328 return true; 329 } 330 } 331 332 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const { 333 if (ST->hasVInstructions()) 334 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock; 335 return BaseT::getMaxVScale(); 336 } 337 338 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const { 339 if (ST->hasVInstructions()) 340 if (unsigned MinVLen = ST->getRealMinVLen(); 341 MinVLen >= RISCV::RVVBitsPerBlock) 342 return MinVLen / RISCV::RVVBitsPerBlock; 343 return BaseT::getVScaleForTuning(); 344 } 345 346 TypeSize 347 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 348 unsigned LMUL = 349 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8)); 350 switch (K) { 351 case TargetTransformInfo::RGK_Scalar: 352 return TypeSize::getFixed(ST->getXLen()); 353 case TargetTransformInfo::RGK_FixedWidthVector: 354 return TypeSize::getFixed( 355 
ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0); 356 case TargetTransformInfo::RGK_ScalableVector: 357 return TypeSize::getScalable( 358 (ST->hasVInstructions() && 359 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock) 360 ? LMUL * RISCV::RVVBitsPerBlock 361 : 0); 362 } 363 364 llvm_unreachable("Unsupported register kind"); 365 } 366 367 InstructionCost 368 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, 369 TTI::TargetCostKind CostKind) const { 370 // Add a cost of address generation + the cost of the load. The address 371 // is expected to be a PC relative offset to a constant pool entry 372 // using auipc/addi. 373 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty), 374 /*AddressSpace=*/0, CostKind); 375 } 376 377 static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) { 378 unsigned Size = Mask.size(); 379 if (!isPowerOf2_32(Size)) 380 return false; 381 for (unsigned I = 0; I != Size; ++I) { 382 if (static_cast<unsigned>(Mask[I]) == I) 383 continue; 384 if (Mask[I] != 0) 385 return false; 386 if (Size % I != 0) 387 return false; 388 for (unsigned J = I + 1; J != Size; ++J) 389 // Check the pattern is repeated. 390 if (static_cast<unsigned>(Mask[J]) != J % I) 391 return false; 392 SubVectorSize = I; 393 return true; 394 } 395 // That means Mask is <0, 1, 2, 3>. This is not a concatenation. 396 return false; 397 } 398 399 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, 400 LLVMContext &C) { 401 assert((DataVT.getScalarSizeInBits() != 8 || 402 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering"); 403 MVT IndexVT = DataVT.changeTypeToInteger(); 404 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT())) 405 IndexVT = IndexVT.changeVectorElementType(MVT::i16); 406 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C)); 407 } 408 409 /// Attempt to approximate the cost of a shuffle which will require splitting 410 /// during legalization. 
Note that processShuffleMasks is not an exact proxy 411 /// for the algorithm used in LegalizeVectorTypes, but hopefully it's a 412 /// reasonably close upperbound. 413 static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, 414 MVT LegalVT, VectorType *Tp, 415 ArrayRef<int> Mask, 416 TTI::TargetCostKind CostKind) { 417 assert(LegalVT.isFixedLengthVector() && !Mask.empty() && 418 "Expected fixed vector type and non-empty mask"); 419 unsigned LegalNumElts = LegalVT.getVectorNumElements(); 420 // Number of destination vectors after legalization: 421 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts); 422 // We are going to permute multiple sources and the result will be in 423 // multiple destinations. Providing an accurate cost only for splits where 424 // the element type remains the same. 425 if (NumOfDests <= 1 || 426 LegalVT.getVectorElementType().getSizeInBits() != 427 Tp->getElementType()->getPrimitiveSizeInBits() || 428 LegalNumElts >= Tp->getElementCount().getFixedValue()) 429 return InstructionCost::getInvalid(); 430 431 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp); 432 unsigned LegalVTSize = LegalVT.getStoreSize(); 433 // Number of source vectors after legalization: 434 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize); 435 436 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts); 437 438 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests); 439 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts; 440 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts; 441 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); 442 assert(NormalizedVF >= Mask.size() && 443 "Normalized mask expected to be not shorter than original mask."); 444 copy(Mask, NormalizedMask.begin()); 445 InstructionCost Cost = 0; 446 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles; 447 processShuffleMasks( 448 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() 
{}, 449 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { 450 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) 451 return; 452 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg)) 453 .second) 454 return; 455 Cost += TTI.getShuffleCost( 456 TTI::SK_PermuteSingleSrc, 457 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()), 458 SingleOpTy, RegMask, CostKind, 0, nullptr); 459 }, 460 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) { 461 Cost += TTI.getShuffleCost( 462 TTI::SK_PermuteTwoSrc, 463 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()), 464 SingleOpTy, RegMask, CostKind, 0, nullptr); 465 }); 466 return Cost; 467 } 468 469 /// Try to perform better estimation of the permutation. 470 /// 1. Split the source/destination vectors into real registers. 471 /// 2. Do the mask analysis to identify which real registers are 472 /// permuted. If more than 1 source registers are used for the 473 /// destination register building, the cost for this destination register 474 /// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one 475 /// source register is used, build mask and calculate the cost as a cost 476 /// of PermuteSingleSrc. 477 /// Also, for the single register permute we try to identify if the 478 /// destination register is just a copy of the source register or the 479 /// copy of the previous destination register (the cost is 480 /// TTI::TCC_Basic). If the source register is just reused, the cost for 481 /// this operation is 0. 
482 static InstructionCost 483 costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, 484 std::optional<unsigned> VLen, VectorType *Tp, 485 ArrayRef<int> Mask, TTI::TargetCostKind CostKind) { 486 assert(LegalVT.isFixedLengthVector()); 487 if (!VLen || Mask.empty()) 488 return InstructionCost::getInvalid(); 489 MVT ElemVT = LegalVT.getVectorElementType(); 490 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); 491 LegalVT = TTI.getTypeLegalizationCost( 492 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg)) 493 .second; 494 // Number of destination vectors after legalization: 495 InstructionCost NumOfDests = 496 divideCeil(Mask.size(), LegalVT.getVectorNumElements()); 497 if (NumOfDests <= 1 || 498 LegalVT.getVectorElementType().getSizeInBits() != 499 Tp->getElementType()->getPrimitiveSizeInBits() || 500 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue()) 501 return InstructionCost::getInvalid(); 502 503 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp); 504 unsigned LegalVTSize = LegalVT.getStoreSize(); 505 // Number of source vectors after legalization: 506 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize); 507 508 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), 509 LegalVT.getVectorNumElements()); 510 511 unsigned E = NumOfDests.getValue(); 512 unsigned NormalizedVF = 513 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); 514 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); 515 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); 516 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); 517 assert(NormalizedVF >= Mask.size() && 518 "Normalized mask expected to be not shorter than original mask."); 519 copy(Mask, NormalizedMask.begin()); 520 InstructionCost Cost = 0; 521 int NumShuffles = 0; 522 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles; 523 processShuffleMasks( 524 NormalizedMask, NumOfSrcRegs, 
NumOfDestRegs, NumOfDestRegs, []() {}, 525 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { 526 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) 527 return; 528 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg)) 529 .second) 530 return; 531 ++NumShuffles; 532 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, 533 SingleOpTy, RegMask, CostKind, 0, nullptr); 534 }, 535 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) { 536 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 537 SingleOpTy, RegMask, CostKind, 0, nullptr); 538 NumShuffles += 2; 539 }); 540 // Note: check that we do not emit too many shuffles here to prevent code 541 // size explosion. 542 // TODO: investigate, if it can be improved by extra analysis of the masks 543 // to check if the code is more profitable. 544 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) || 545 (NumOfDestRegs <= 2 && NumShuffles < 4)) 546 return Cost; 547 return InstructionCost::getInvalid(); 548 } 549 550 InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp, 551 ArrayRef<int> Mask, 552 TTI::TargetCostKind CostKind) const { 553 // Avoid missing masks and length changing shuffles 554 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements()) 555 return InstructionCost::getInvalid(); 556 557 int NumElts = Tp->getNumElements(); 558 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 559 // Avoid scalarization cases 560 if (!LT.second.isFixedLengthVector()) 561 return InstructionCost::getInvalid(); 562 563 // Requires moving elements between parts, which requires additional 564 // unmodeled instructions. 565 if (LT.first != 1) 566 return InstructionCost::getInvalid(); 567 568 auto GetSlideOpcode = [&](int SlideAmt) { 569 assert(SlideAmt != 0); 570 bool IsVI = isUInt<5>(std::abs(SlideAmt)); 571 if (SlideAmt < 0) 572 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX; 573 return IsVI ? 
RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX; 574 }; 575 576 std::array<std::pair<int, int>, 2> SrcInfo; 577 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo)) 578 return InstructionCost::getInvalid(); 579 580 if (SrcInfo[1].second == 0) 581 std::swap(SrcInfo[0], SrcInfo[1]); 582 583 InstructionCost FirstSlideCost = 0; 584 if (SrcInfo[0].second != 0) { 585 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second); 586 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind); 587 } 588 589 if (SrcInfo[1].first == -1) 590 return FirstSlideCost; 591 592 InstructionCost SecondSlideCost = 0; 593 if (SrcInfo[1].second != 0) { 594 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second); 595 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind); 596 } else { 597 SecondSlideCost = 598 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind); 599 } 600 601 auto EC = Tp->getElementCount(); 602 VectorType *MaskTy = 603 VectorType::get(IntegerType::getInt1Ty(Tp->getContext()), EC); 604 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); 605 return FirstSlideCost + SecondSlideCost + MaskCost; 606 } 607 608 InstructionCost 609 RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, 610 VectorType *SrcTy, ArrayRef<int> Mask, 611 TTI::TargetCostKind CostKind, int Index, 612 VectorType *SubTp, ArrayRef<const Value *> Args, 613 const Instruction *CxtI) const { 614 assert((Mask.empty() || DstTy->isScalableTy() || 615 Mask.size() == DstTy->getElementCount().getKnownMinValue()) && 616 "Expected the Mask to match the return size if given"); 617 assert(SrcTy->getScalarType() == DstTy->getScalarType() && 618 "Expected the same scalar types"); 619 620 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp); 621 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy); 622 623 // First, handle cases where having a fixed length vector enables us to 624 // give a more accurate cost than falling back to generic scalable 
codegen. 625 // TODO: Each of these cases hints at a modeling gap around scalable vectors. 626 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy); 627 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) { 628 InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting( 629 *this, LT.second, ST->getRealVLen(), 630 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind); 631 if (VRegSplittingCost.isValid()) 632 return VRegSplittingCost; 633 switch (Kind) { 634 default: 635 break; 636 case TTI::SK_PermuteSingleSrc: { 637 if (Mask.size() >= 2) { 638 MVT EltTp = LT.second.getVectorElementType(); 639 // If the size of the element is < ELEN then shuffles of interleaves and 640 // deinterleaves of 2 vectors can be lowered into the following 641 // sequences 642 if (EltTp.getScalarSizeInBits() < ST->getELen()) { 643 // Example sequence: 644 // vsetivli zero, 4, e8, mf4, ta, ma (ignored) 645 // vwaddu.vv v10, v8, v9 646 // li a0, -1 (ignored) 647 // vwmaccu.vx v10, a0, v9 648 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size())) 649 return 2 * LT.first * TLI->getLMULCost(LT.second); 650 651 if (Mask[0] == 0 || Mask[0] == 1) { 652 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size()); 653 // Example sequence: 654 // vnsrl.wi v10, v8, 0 655 if (equal(DeinterleaveMask, Mask)) 656 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI, 657 LT.second, CostKind); 658 } 659 } 660 int SubVectorSize; 661 if (LT.second.getScalarSizeInBits() != 1 && 662 isRepeatedConcatMask(Mask, SubVectorSize)) { 663 InstructionCost Cost = 0; 664 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize); 665 // The cost of extraction from a subvector is 0 if the index is 0. 
666 for (unsigned I = 0; I != NumSlides; ++I) { 667 unsigned InsertIndex = SubVectorSize * (1 << I); 668 FixedVectorType *SubTp = 669 FixedVectorType::get(SrcTy->getElementType(), InsertIndex); 670 FixedVectorType *DestTp = 671 FixedVectorType::getDoubleElementsVectorType(SubTp); 672 std::pair<InstructionCost, MVT> DestLT = 673 getTypeLegalizationCost(DestTp); 674 // Add the cost of whole vector register move because the 675 // destination vector register group for vslideup cannot overlap the 676 // source. 677 Cost += DestLT.first * TLI->getLMULCost(DestLT.second); 678 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {}, 679 CostKind, InsertIndex, SubTp); 680 } 681 return Cost; 682 } 683 } 684 685 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind); 686 SlideCost.isValid()) 687 return SlideCost; 688 689 // vrgather + cost of generating the mask constant. 690 // We model this for an unknown mask with a single vrgather. 691 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 || 692 LT.second.getVectorNumElements() <= 256)) { 693 VectorType *IdxTy = 694 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext()); 695 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); 696 return IndexCost + 697 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); 698 } 699 break; 700 } 701 case TTI::SK_Transpose: 702 case TTI::SK_PermuteTwoSrc: { 703 704 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind); 705 SlideCost.isValid()) 706 return SlideCost; 707 708 // 2 x (vrgather + cost of generating the mask constant) + cost of mask 709 // register for the second vrgather. We model this for an unknown 710 // (shuffle) mask. 
711 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 || 712 LT.second.getVectorNumElements() <= 256)) { 713 auto &C = SrcTy->getContext(); 714 auto EC = SrcTy->getElementCount(); 715 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C); 716 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC); 717 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); 718 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); 719 return 2 * IndexCost + 720 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, 721 LT.second, CostKind) + 722 MaskCost; 723 } 724 break; 725 } 726 } 727 728 auto shouldSplit = [](TTI::ShuffleKind Kind) { 729 switch (Kind) { 730 default: 731 return false; 732 case TTI::SK_PermuteSingleSrc: 733 case TTI::SK_Transpose: 734 case TTI::SK_PermuteTwoSrc: 735 return true; 736 } 737 }; 738 739 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 && 740 shouldSplit(Kind)) { 741 InstructionCost SplitCost = 742 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind); 743 if (SplitCost.isValid()) 744 return SplitCost; 745 } 746 } 747 748 // Handle scalable vectors (and fixed vectors legalized to scalable vectors). 749 switch (Kind) { 750 default: 751 // Fallthrough to generic handling. 752 // TODO: Most of these cases will return getInvalid in generic code, and 753 // must be implemented here. 754 break; 755 case TTI::SK_ExtractSubvector: 756 // Extract at zero is always a subregister extract 757 if (Index == 0) 758 return TTI::TCC_Free; 759 760 // If we're extracting a subvector of at most m1 size at a sub-register 761 // boundary - which unfortunately we need exact vlen to identify - this is 762 // a subregister extract at worst and thus won't require a vslidedown. 
763 // TODO: Extend for aligned m2, m4 subvector extracts 764 // TODO: Extend for misalgined (but contained) extracts 765 // TODO: Extend for scalable subvector types 766 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 767 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) { 768 if (std::optional<unsigned> VLen = ST->getRealVLen(); 769 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 && 770 SubLT.second.getSizeInBits() <= *VLen) 771 return TTI::TCC_Free; 772 } 773 774 // Example sequence: 775 // vsetivli zero, 4, e8, mf2, tu, ma (ignored) 776 // vslidedown.vi v8, v9, 2 777 return LT.first * 778 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind); 779 case TTI::SK_InsertSubvector: 780 // Example sequence: 781 // vsetivli zero, 4, e8, mf2, tu, ma (ignored) 782 // vslideup.vi v8, v9, 2 783 LT = getTypeLegalizationCost(DstTy); 784 return LT.first * 785 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind); 786 case TTI::SK_Select: { 787 // Example sequence: 788 // li a0, 90 789 // vsetivli zero, 8, e8, mf2, ta, ma (ignored) 790 // vmv.s.x v0, a0 791 // vmerge.vvm v8, v9, v8, v0 792 // We use 2 for the cost of the mask materialization as this is the true 793 // cost for small masks and most shuffles are small. At worst, this cost 794 // should be a very small constant for the constant pool load. As such, 795 // we may bias towards large selects slightly more than truly warranted. 
796 return LT.first * 797 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, 798 LT.second, CostKind)); 799 } 800 case TTI::SK_Broadcast: { 801 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == 802 Instruction::InsertElement); 803 if (LT.second.getScalarSizeInBits() == 1) { 804 if (HasScalar) { 805 // Example sequence: 806 // andi a0, a0, 1 807 // vsetivli zero, 2, e8, mf8, ta, ma (ignored) 808 // vmv.v.x v8, a0 809 // vmsne.vi v0, v8, 0 810 return LT.first * 811 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, 812 LT.second, CostKind)); 813 } 814 // Example sequence: 815 // vsetivli zero, 2, e8, mf8, ta, mu (ignored) 816 // vmv.v.i v8, 0 817 // vmerge.vim v8, v8, 1, v0 818 // vmv.x.s a0, v8 819 // andi a0, a0, 1 820 // vmv.v.x v8, a0 821 // vmsne.vi v0, v8, 0 822 823 return LT.first * 824 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, 825 RISCV::VMV_X_S, RISCV::VMV_V_X, 826 RISCV::VMSNE_VI}, 827 LT.second, CostKind)); 828 } 829 830 if (HasScalar) { 831 // Example sequence: 832 // vmv.v.x v8, a0 833 return LT.first * 834 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind); 835 } 836 837 // Example sequence: 838 // vrgather.vi v9, v8, 0 839 return LT.first * 840 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind); 841 } 842 case TTI::SK_Splice: { 843 // vslidedown+vslideup. 844 // TODO: Multiplying by LT.first implies this legalizes into multiple copies 845 // of similar code, but I think we expand through memory. 
846 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; 847 if (Index >= 0 && Index < 32) 848 Opcodes[0] = RISCV::VSLIDEDOWN_VI; 849 else if (Index < 0 && Index > -32) 850 Opcodes[1] = RISCV::VSLIDEUP_VI; 851 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); 852 } 853 case TTI::SK_Reverse: { 854 855 if (!LT.second.isVector()) 856 return InstructionCost::getInvalid(); 857 858 // TODO: Cases to improve here: 859 // * Illegal vector types 860 // * i64 on RV32 861 if (SrcTy->getElementType()->isIntegerTy(1)) { 862 VectorType *WideTy = 863 VectorType::get(IntegerType::get(SrcTy->getContext(), 8), 864 cast<VectorType>(SrcTy)->getElementCount()); 865 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy, 866 TTI::CastContextHint::None, CostKind) + 867 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0, 868 nullptr) + 869 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy, 870 TTI::CastContextHint::None, CostKind); 871 } 872 873 MVT ContainerVT = LT.second; 874 if (LT.second.isFixedLengthVector()) 875 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second); 876 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT); 877 if (ContainerVT.bitsLE(M1VT)) { 878 // Example sequence: 879 // csrr a0, vlenb 880 // srli a0, a0, 3 881 // addi a0, a0, -1 882 // vsetvli a1, zero, e8, mf8, ta, mu (ignored) 883 // vid.v v9 884 // vrsub.vx v10, v9, a0 885 // vrgather.vv v9, v8, v10 886 InstructionCost LenCost = 3; 887 if (LT.second.isFixedLengthVector()) 888 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices 889 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 
0 : 1; 890 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; 891 if (LT.second.isFixedLengthVector() && 892 isInt<5>(LT.second.getVectorNumElements() - 1)) 893 Opcodes[1] = RISCV::VRSUB_VI; 894 InstructionCost GatherCost = 895 getRISCVInstructionCost(Opcodes, LT.second, CostKind); 896 return LT.first * (LenCost + GatherCost); 897 } 898 899 // At high LMUL, we split into a series of M1 reverses (see 900 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate 901 // the resulting gap at the bottom (for fixed vectors only). The important 902 // bit is that the cost scales linearly, not quadratically with LMUL. 903 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX}; 904 InstructionCost FixedCost = 905 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3; 906 unsigned Ratio = 907 ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements(); 908 InstructionCost GatherCost = 909 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio; 910 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 
0 : 911 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind); 912 return FixedCost + LT.first * (GatherCost + SlideCost); 913 } 914 } 915 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, 916 SubTp); 917 } 918 919 static unsigned isM1OrSmaller(MVT VT) { 920 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); 921 return (LMUL == RISCVVType::VLMUL::LMUL_F8 || 922 LMUL == RISCVVType::VLMUL::LMUL_F4 || 923 LMUL == RISCVVType::VLMUL::LMUL_F2 || 924 LMUL == RISCVVType::VLMUL::LMUL_1); 925 } 926 927 InstructionCost RISCVTTIImpl::getScalarizationOverhead( 928 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, 929 TTI::TargetCostKind CostKind, bool ForPoisonSrc, 930 ArrayRef<Value *> VL) const { 931 if (isa<ScalableVectorType>(Ty)) 932 return InstructionCost::getInvalid(); 933 934 // A build_vector (which is m1 sized or smaller) can be done in no 935 // worse than one vslide1down.vx per element in the type. We could 936 // in theory do an explode_vector in the inverse manner, but our 937 // lowering today does not have a first class node for this pattern. 938 InstructionCost Cost = BaseT::getScalarizationOverhead( 939 Ty, DemandedElts, Insert, Extract, CostKind); 940 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 941 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) { 942 if (Ty->getScalarSizeInBits() == 1) { 943 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8)); 944 // Note: Implicit scalar anyextend is assumed to be free since the i1 945 // must be stored in a GPR. 
946 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract, 947 CostKind) + 948 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, 949 TTI::CastContextHint::None, CostKind, nullptr); 950 } 951 952 assert(LT.second.isFixedLengthVector()); 953 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second); 954 if (isM1OrSmaller(ContainerVT)) { 955 InstructionCost BV = 956 cast<FixedVectorType>(Ty)->getNumElements() * 957 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind); 958 if (BV < Cost) 959 Cost = BV; 960 } 961 } 962 return Cost; 963 } 964 965 InstructionCost 966 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, 967 unsigned AddressSpace, 968 TTI::TargetCostKind CostKind) const { 969 if (!isLegalMaskedLoadStore(Src, Alignment) || 970 CostKind != TTI::TCK_RecipThroughput) 971 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 972 CostKind); 973 974 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); 975 } 976 977 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( 978 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 979 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 980 bool UseMaskForCond, bool UseMaskForGaps) const { 981 982 // The interleaved memory access pass will lower interleaved memory ops (i.e 983 // a load and store followed by a specific shuffle) to vlseg/vsseg 984 // intrinsics. 
985 if (!UseMaskForCond && !UseMaskForGaps && 986 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 987 auto *VTy = cast<VectorType>(VecTy); 988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy); 989 // Need to make sure type has't been scalarized 990 if (LT.second.isVector()) { 991 auto *SubVecTy = 992 VectorType::get(VTy->getElementType(), 993 VTy->getElementCount().divideCoefficientBy(Factor)); 994 if (VTy->getElementCount().isKnownMultipleOf(Factor) && 995 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment, 996 AddressSpace, DL)) { 997 998 // Some processors optimize segment loads/stores as one wide memory op + 999 // Factor * LMUL shuffle ops. 1000 if (ST->hasOptimizedSegmentLoadStore(Factor)) { 1001 InstructionCost Cost = 1002 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind); 1003 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT(); 1004 Cost += Factor * TLI->getLMULCost(SubVecVT); 1005 return LT.first * Cost; 1006 } 1007 1008 // Otherwise, the cost is proportional to the number of elements (VL * 1009 // Factor ops). 1010 InstructionCost MemOpCost = 1011 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, 1012 CostKind, {TTI::OK_AnyValue, TTI::OP_None}); 1013 unsigned NumLoads = getEstimatedVLFor(VTy); 1014 return NumLoads * MemOpCost; 1015 } 1016 } 1017 } 1018 1019 // TODO: Return the cost of interleaved accesses for scalable vector when 1020 // unable to convert to segment accesses instructions. 
1021 if (isa<ScalableVectorType>(VecTy)) 1022 return InstructionCost::getInvalid(); 1023 1024 auto *FVTy = cast<FixedVectorType>(VecTy); 1025 InstructionCost MemCost = 1026 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); 1027 unsigned VF = FVTy->getNumElements() / Factor; 1028 1029 // An interleaved load will look like this for Factor=3: 1030 // %wide.vec = load <12 x i32>, ptr %3, align 4 1031 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask> 1032 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> 1033 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> 1034 if (Opcode == Instruction::Load) { 1035 InstructionCost Cost = MemCost; 1036 for (unsigned Index : Indices) { 1037 FixedVectorType *VecTy = 1038 FixedVectorType::get(FVTy->getElementType(), VF * Factor); 1039 auto Mask = createStrideMask(Index, Factor, VF); 1040 Mask.resize(VF * Factor, -1); 1041 InstructionCost ShuffleCost = 1042 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy, VecTy, 1043 Mask, CostKind, 0, nullptr, {}); 1044 Cost += ShuffleCost; 1045 } 1046 return Cost; 1047 } 1048 1049 // TODO: Model for NF > 2 1050 // We'll need to enhance getShuffleCost to model shuffles that are just 1051 // inserts and extracts into subvectors, since they won't have the full cost 1052 // of a vrgather. 
1053 // An interleaved store for 3 vectors of 4 lanes will look like 1054 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7> 1055 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3> 1056 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11> 1057 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask> 1058 // store <12 x i32> %interleaved.vec, ptr %10, align 4 1059 if (Factor != 2) 1060 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 1061 Alignment, AddressSpace, CostKind, 1062 UseMaskForCond, UseMaskForGaps); 1063 1064 assert(Opcode == Instruction::Store && "Opcode must be a store"); 1065 // For an interleaving store of 2 vectors, we perform one large interleaving 1066 // shuffle that goes into the wide store 1067 auto Mask = createInterleaveMask(VF, Factor); 1068 InstructionCost ShuffleCost = 1069 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, FVTy, Mask, 1070 CostKind, 0, nullptr, {}); 1071 return MemCost + ShuffleCost; 1072 } 1073 1074 InstructionCost RISCVTTIImpl::getGatherScatterOpCost( 1075 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 1076 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { 1077 if (CostKind != TTI::TCK_RecipThroughput) 1078 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1079 Alignment, CostKind, I); 1080 1081 if ((Opcode == Instruction::Load && 1082 !isLegalMaskedGather(DataTy, Align(Alignment))) || 1083 (Opcode == Instruction::Store && 1084 !isLegalMaskedScatter(DataTy, Align(Alignment)))) 1085 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1086 Alignment, CostKind, I); 1087 1088 // Cost is proportional to the number of memory operations implied. For 1089 // scalable vectors, we use an estimate on that number since we don't 1090 // know exactly what VL will be. 
1091 auto &VTy = *cast<VectorType>(DataTy); 1092 InstructionCost MemOpCost = 1093 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind, 1094 {TTI::OK_AnyValue, TTI::OP_None}, I); 1095 unsigned NumLoads = getEstimatedVLFor(&VTy); 1096 return NumLoads * MemOpCost; 1097 } 1098 1099 InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost( 1100 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, 1101 TTI::TargetCostKind CostKind, const Instruction *I) const { 1102 bool IsLegal = (Opcode == Instruction::Store && 1103 isLegalMaskedCompressStore(DataTy, Alignment)) || 1104 (Opcode == Instruction::Load && 1105 isLegalMaskedExpandLoad(DataTy, Alignment)); 1106 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput) 1107 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask, 1108 Alignment, CostKind, I); 1109 // Example compressstore sequence: 1110 // vsetivli zero, 8, e32, m2, ta, ma (ignored) 1111 // vcompress.vm v10, v8, v0 1112 // vcpop.m a1, v0 1113 // vsetvli zero, a1, e32, m2, ta, ma 1114 // vse32.v v10, (a0) 1115 // Example expandload sequence: 1116 // vsetivli zero, 8, e8, mf2, ta, ma (ignored) 1117 // vcpop.m a1, v0 1118 // vsetvli zero, a1, e32, m2, ta, ma 1119 // vle32.v v10, (a0) 1120 // vsetivli zero, 8, e32, m2, ta, ma 1121 // viota.m v12, v0 1122 // vrgather.vv v8, v10, v12, v0.t 1123 auto MemOpCost = 1124 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind); 1125 auto LT = getTypeLegalizationCost(DataTy); 1126 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI}; 1127 if (VariableMask) 1128 Opcodes.push_back(RISCV::VCPOP_M); 1129 if (Opcode == Instruction::Store) 1130 Opcodes.append({RISCV::VCOMPRESS_VM}); 1131 else 1132 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV}); 1133 return MemOpCost + 1134 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); 1135 } 1136 1137 InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( 1138 unsigned Opcode, Type *DataTy, 
const Value *Ptr, bool VariableMask, 1139 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { 1140 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) && 1141 !isLegalStridedLoadStore(DataTy, Alignment)) || 1142 (Opcode != Instruction::Load && Opcode != Instruction::Store)) 1143 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, 1144 Alignment, CostKind, I); 1145 1146 if (CostKind == TTI::TCK_CodeSize) 1147 return TTI::TCC_Basic; 1148 1149 // Cost is proportional to the number of memory operations implied. For 1150 // scalable vectors, we use an estimate on that number since we don't 1151 // know exactly what VL will be. 1152 auto &VTy = *cast<VectorType>(DataTy); 1153 InstructionCost MemOpCost = 1154 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind, 1155 {TTI::OK_AnyValue, TTI::OP_None}, I); 1156 unsigned NumLoads = getEstimatedVLFor(&VTy); 1157 return NumLoads * MemOpCost; 1158 } 1159 1160 InstructionCost 1161 RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const { 1162 // FIXME: This is a property of the default vector convention, not 1163 // all possible calling conventions. Fixing that will require 1164 // some TTI API and SLP rework. 1165 InstructionCost Cost = 0; 1166 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1167 for (auto *Ty : Tys) { 1168 if (!Ty->isVectorTy()) 1169 continue; 1170 Align A = DL.getPrefTypeAlign(Ty); 1171 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) + 1172 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind); 1173 } 1174 return Cost; 1175 } 1176 1177 // Currently, these represent both throughput and codesize costs 1178 // for the respective intrinsics. The costs in this table are simply 1179 // instruction counts with the following adjustments made: 1180 // * One vsetvli is considered free. 
1181 static const CostTblEntry VectorIntrinsicCostTable[]{ 1182 {Intrinsic::floor, MVT::f32, 9}, 1183 {Intrinsic::floor, MVT::f64, 9}, 1184 {Intrinsic::ceil, MVT::f32, 9}, 1185 {Intrinsic::ceil, MVT::f64, 9}, 1186 {Intrinsic::trunc, MVT::f32, 7}, 1187 {Intrinsic::trunc, MVT::f64, 7}, 1188 {Intrinsic::round, MVT::f32, 9}, 1189 {Intrinsic::round, MVT::f64, 9}, 1190 {Intrinsic::roundeven, MVT::f32, 9}, 1191 {Intrinsic::roundeven, MVT::f64, 9}, 1192 {Intrinsic::rint, MVT::f32, 7}, 1193 {Intrinsic::rint, MVT::f64, 7}, 1194 {Intrinsic::lrint, MVT::i32, 1}, 1195 {Intrinsic::lrint, MVT::i64, 1}, 1196 {Intrinsic::llrint, MVT::i64, 1}, 1197 {Intrinsic::nearbyint, MVT::f32, 9}, 1198 {Intrinsic::nearbyint, MVT::f64, 9}, 1199 {Intrinsic::bswap, MVT::i16, 3}, 1200 {Intrinsic::bswap, MVT::i32, 12}, 1201 {Intrinsic::bswap, MVT::i64, 31}, 1202 {Intrinsic::vp_bswap, MVT::i16, 3}, 1203 {Intrinsic::vp_bswap, MVT::i32, 12}, 1204 {Intrinsic::vp_bswap, MVT::i64, 31}, 1205 {Intrinsic::vp_fshl, MVT::i8, 7}, 1206 {Intrinsic::vp_fshl, MVT::i16, 7}, 1207 {Intrinsic::vp_fshl, MVT::i32, 7}, 1208 {Intrinsic::vp_fshl, MVT::i64, 7}, 1209 {Intrinsic::vp_fshr, MVT::i8, 7}, 1210 {Intrinsic::vp_fshr, MVT::i16, 7}, 1211 {Intrinsic::vp_fshr, MVT::i32, 7}, 1212 {Intrinsic::vp_fshr, MVT::i64, 7}, 1213 {Intrinsic::bitreverse, MVT::i8, 17}, 1214 {Intrinsic::bitreverse, MVT::i16, 24}, 1215 {Intrinsic::bitreverse, MVT::i32, 33}, 1216 {Intrinsic::bitreverse, MVT::i64, 52}, 1217 {Intrinsic::vp_bitreverse, MVT::i8, 17}, 1218 {Intrinsic::vp_bitreverse, MVT::i16, 24}, 1219 {Intrinsic::vp_bitreverse, MVT::i32, 33}, 1220 {Intrinsic::vp_bitreverse, MVT::i64, 52}, 1221 {Intrinsic::ctpop, MVT::i8, 12}, 1222 {Intrinsic::ctpop, MVT::i16, 19}, 1223 {Intrinsic::ctpop, MVT::i32, 20}, 1224 {Intrinsic::ctpop, MVT::i64, 21}, 1225 {Intrinsic::ctlz, MVT::i8, 19}, 1226 {Intrinsic::ctlz, MVT::i16, 28}, 1227 {Intrinsic::ctlz, MVT::i32, 31}, 1228 {Intrinsic::ctlz, MVT::i64, 35}, 1229 {Intrinsic::cttz, MVT::i8, 16}, 1230 
{Intrinsic::cttz, MVT::i16, 23}, 1231 {Intrinsic::cttz, MVT::i32, 24}, 1232 {Intrinsic::cttz, MVT::i64, 25}, 1233 {Intrinsic::vp_ctpop, MVT::i8, 12}, 1234 {Intrinsic::vp_ctpop, MVT::i16, 19}, 1235 {Intrinsic::vp_ctpop, MVT::i32, 20}, 1236 {Intrinsic::vp_ctpop, MVT::i64, 21}, 1237 {Intrinsic::vp_ctlz, MVT::i8, 19}, 1238 {Intrinsic::vp_ctlz, MVT::i16, 28}, 1239 {Intrinsic::vp_ctlz, MVT::i32, 31}, 1240 {Intrinsic::vp_ctlz, MVT::i64, 35}, 1241 {Intrinsic::vp_cttz, MVT::i8, 16}, 1242 {Intrinsic::vp_cttz, MVT::i16, 23}, 1243 {Intrinsic::vp_cttz, MVT::i32, 24}, 1244 {Intrinsic::vp_cttz, MVT::i64, 25}, 1245 }; 1246 1247 InstructionCost 1248 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1249 TTI::TargetCostKind CostKind) const { 1250 auto *RetTy = ICA.getReturnType(); 1251 switch (ICA.getID()) { 1252 case Intrinsic::lrint: 1253 case Intrinsic::llrint: 1254 // We can't currently lower half or bfloat vector lrint/llrint. 1255 if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]); 1256 VecTy && VecTy->getElementType()->is16bitFPTy()) 1257 return InstructionCost::getInvalid(); 1258 [[fallthrough]]; 1259 case Intrinsic::ceil: 1260 case Intrinsic::floor: 1261 case Intrinsic::trunc: 1262 case Intrinsic::rint: 1263 case Intrinsic::round: 1264 case Intrinsic::roundeven: { 1265 // These all use the same code. 
1266 auto LT = getTypeLegalizationCost(RetTy); 1267 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second)) 1268 return LT.first * 8; 1269 break; 1270 } 1271 case Intrinsic::umin: 1272 case Intrinsic::umax: 1273 case Intrinsic::smin: 1274 case Intrinsic::smax: { 1275 auto LT = getTypeLegalizationCost(RetTy); 1276 if (LT.second.isScalarInteger() && ST->hasStdExtZbb()) 1277 return LT.first; 1278 1279 if (ST->hasVInstructions() && LT.second.isVector()) { 1280 unsigned Op; 1281 switch (ICA.getID()) { 1282 case Intrinsic::umin: 1283 Op = RISCV::VMINU_VV; 1284 break; 1285 case Intrinsic::umax: 1286 Op = RISCV::VMAXU_VV; 1287 break; 1288 case Intrinsic::smin: 1289 Op = RISCV::VMIN_VV; 1290 break; 1291 case Intrinsic::smax: 1292 Op = RISCV::VMAX_VV; 1293 break; 1294 } 1295 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind); 1296 } 1297 break; 1298 } 1299 case Intrinsic::sadd_sat: 1300 case Intrinsic::ssub_sat: 1301 case Intrinsic::uadd_sat: 1302 case Intrinsic::usub_sat: { 1303 auto LT = getTypeLegalizationCost(RetTy); 1304 if (ST->hasVInstructions() && LT.second.isVector()) { 1305 unsigned Op; 1306 switch (ICA.getID()) { 1307 case Intrinsic::sadd_sat: 1308 Op = RISCV::VSADD_VV; 1309 break; 1310 case Intrinsic::ssub_sat: 1311 Op = RISCV::VSSUBU_VV; 1312 break; 1313 case Intrinsic::uadd_sat: 1314 Op = RISCV::VSADDU_VV; 1315 break; 1316 case Intrinsic::usub_sat: 1317 Op = RISCV::VSSUBU_VV; 1318 break; 1319 } 1320 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind); 1321 } 1322 break; 1323 } 1324 case Intrinsic::fma: 1325 case Intrinsic::fmuladd: { 1326 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin 1327 auto LT = getTypeLegalizationCost(RetTy); 1328 if (ST->hasVInstructions() && LT.second.isVector()) 1329 return LT.first * 1330 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind); 1331 break; 1332 } 1333 case Intrinsic::fabs: { 1334 auto LT = getTypeLegalizationCost(RetTy); 1335 if 
(ST->hasVInstructions() && LT.second.isVector()) { 1336 // lui a0, 8 1337 // addi a0, a0, -1 1338 // vsetvli a1, zero, e16, m1, ta, ma 1339 // vand.vx v8, v8, a0 1340 // f16 with zvfhmin and bf16 with zvfhbmin 1341 if (LT.second.getVectorElementType() == MVT::bf16 || 1342 (LT.second.getVectorElementType() == MVT::f16 && 1343 !ST->hasVInstructionsF16())) 1344 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second, 1345 CostKind) + 1346 2; 1347 else 1348 return LT.first * 1349 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind); 1350 } 1351 break; 1352 } 1353 case Intrinsic::sqrt: { 1354 auto LT = getTypeLegalizationCost(RetTy); 1355 if (ST->hasVInstructions() && LT.second.isVector()) { 1356 SmallVector<unsigned, 4> ConvOp; 1357 SmallVector<unsigned, 2> FsqrtOp; 1358 MVT ConvType = LT.second; 1359 MVT FsqrtType = LT.second; 1360 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16 1361 // will be spilt. 1362 if (LT.second.getVectorElementType() == MVT::bf16) { 1363 if (LT.second == MVT::nxv32bf16) { 1364 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V, 1365 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W}; 1366 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V}; 1367 ConvType = MVT::nxv16f16; 1368 FsqrtType = MVT::nxv16f32; 1369 } else { 1370 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W}; 1371 FsqrtOp = {RISCV::VFSQRT_V}; 1372 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType); 1373 } 1374 } else if (LT.second.getVectorElementType() == MVT::f16 && 1375 !ST->hasVInstructionsF16()) { 1376 if (LT.second == MVT::nxv32f16) { 1377 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V, 1378 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W}; 1379 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V}; 1380 ConvType = MVT::nxv16f16; 1381 FsqrtType = MVT::nxv16f32; 1382 } else { 1383 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W}; 1384 FsqrtOp = {RISCV::VFSQRT_V}; 1385 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, 
FsqrtType); 1386 } 1387 } else { 1388 FsqrtOp = {RISCV::VFSQRT_V}; 1389 } 1390 1391 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) + 1392 getRISCVInstructionCost(ConvOp, ConvType, CostKind)); 1393 } 1394 break; 1395 } 1396 case Intrinsic::cttz: 1397 case Intrinsic::ctlz: 1398 case Intrinsic::ctpop: { 1399 auto LT = getTypeLegalizationCost(RetTy); 1400 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) { 1401 unsigned Op; 1402 switch (ICA.getID()) { 1403 case Intrinsic::cttz: 1404 Op = RISCV::VCTZ_V; 1405 break; 1406 case Intrinsic::ctlz: 1407 Op = RISCV::VCLZ_V; 1408 break; 1409 case Intrinsic::ctpop: 1410 Op = RISCV::VCPOP_V; 1411 break; 1412 } 1413 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind); 1414 } 1415 break; 1416 } 1417 case Intrinsic::abs: { 1418 auto LT = getTypeLegalizationCost(RetTy); 1419 if (ST->hasVInstructions() && LT.second.isVector()) { 1420 // vrsub.vi v10, v8, 0 1421 // vmax.vv v8, v8, v10 1422 return LT.first * 1423 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV}, 1424 LT.second, CostKind); 1425 } 1426 break; 1427 } 1428 case Intrinsic::get_active_lane_mask: { 1429 if (ST->hasVInstructions()) { 1430 Type *ExpRetTy = VectorType::get( 1431 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); 1432 auto LT = getTypeLegalizationCost(ExpRetTy); 1433 1434 // vid.v v8 // considered hoisted 1435 // vsaddu.vx v8, v8, a0 1436 // vmsltu.vx v0, v8, a1 1437 return LT.first * 1438 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX}, 1439 LT.second, CostKind); 1440 } 1441 break; 1442 } 1443 // TODO: add more intrinsic 1444 case Intrinsic::stepvector: { 1445 auto LT = getTypeLegalizationCost(RetTy); 1446 // Legalisation of illegal types involves an `index' instruction plus 1447 // (LT.first - 1) vector adds. 
1448 if (ST->hasVInstructions()) 1449 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) + 1450 (LT.first - 1) * 1451 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind); 1452 return 1 + (LT.first - 1); 1453 } 1454 case Intrinsic::experimental_cttz_elts: { 1455 Type *ArgTy = ICA.getArgTypes()[0]; 1456 EVT ArgType = TLI->getValueType(DL, ArgTy, true); 1457 if (getTLI()->shouldExpandCttzElements(ArgType)) 1458 break; 1459 InstructionCost Cost = getRISCVInstructionCost( 1460 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind); 1461 1462 // If zero_is_poison is false, then we will generate additional 1463 // cmp + select instructions to convert -1 to EVL. 1464 Type *BoolTy = Type::getInt1Ty(RetTy->getContext()); 1465 if (ICA.getArgs().size() > 1 && 1466 cast<ConstantInt>(ICA.getArgs()[1])->isZero()) 1467 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy, 1468 CmpInst::ICMP_SLT, CostKind) + 1469 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy, 1470 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1471 1472 return Cost; 1473 } 1474 case Intrinsic::experimental_vp_splat: { 1475 auto LT = getTypeLegalizationCost(RetTy); 1476 // TODO: Lower i1 experimental_vp_splat 1477 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1) 1478 return InstructionCost::getInvalid(); 1479 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint() 1480 ? RISCV::VFMV_V_F 1481 : RISCV::VMV_V_X, 1482 LT.second, CostKind); 1483 } 1484 case Intrinsic::experimental_vp_splice: { 1485 // To support type-based query from vectorizer, set the index to 0. 1486 // Note that index only change the cost from vslide.vx to vslide.vi and in 1487 // current implementations they have same costs. 
1488 return getShuffleCost(TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()), 1489 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind, 1490 0, cast<VectorType>(ICA.getReturnType())); 1491 } 1492 } 1493 1494 if (ST->hasVInstructions() && RetTy->isVectorTy()) { 1495 if (auto LT = getTypeLegalizationCost(RetTy); 1496 LT.second.isVector()) { 1497 MVT EltTy = LT.second.getVectorElementType(); 1498 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable, 1499 ICA.getID(), EltTy)) 1500 return LT.first * Entry->Cost; 1501 } 1502 } 1503 1504 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1505 } 1506 1507 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1508 Type *Src, 1509 TTI::CastContextHint CCH, 1510 TTI::TargetCostKind CostKind, 1511 const Instruction *I) const { 1512 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src); 1513 if (!IsVectorType) 1514 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1515 1516 // FIXME: Need to compute legalizing cost for illegal types. The current 1517 // code handles only legal types and those which can be trivially 1518 // promoted to legal. 1519 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() || 1520 Dst->getScalarSizeInBits() > ST->getELen()) 1521 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1522 1523 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1524 assert(ISD && "Invalid opcode"); 1525 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 1526 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst); 1527 1528 // Handle i1 source and dest cases *before* calling logic in BasicTTI. 1529 // The shared implementation doesn't model vector widening during legalization 1530 // and instead assumes scalarization. In order to scalarize an <N x i1> 1531 // vector, we need to extend/trunc to/from i8. If we don't special case 1532 // this, we can get an infinite recursion cycle. 
1533 switch (ISD) { 1534 default: 1535 break; 1536 case ISD::SIGN_EXTEND: 1537 case ISD::ZERO_EXTEND: 1538 if (Src->getScalarSizeInBits() == 1) { 1539 // We do not use vsext/vzext to extend from mask vector. 1540 // Instead we use the following instructions to extend from mask vector: 1541 // vmv.v.i v8, 0 1542 // vmerge.vim v8, v8, -1, v0 (repeated per split) 1543 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) + 1544 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM, 1545 DstLT.second, CostKind) + 1546 DstLT.first - 1; 1547 } 1548 break; 1549 case ISD::TRUNCATE: 1550 if (Dst->getScalarSizeInBits() == 1) { 1551 // We do not use several vncvt to truncate to mask vector. So we could 1552 // not use PowDiff to calculate it. 1553 // Instead we use the following instructions to truncate to mask vector: 1554 // vand.vi v8, v8, 1 1555 // vmsne.vi v0, v8, 0 1556 return SrcLT.first * 1557 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI}, 1558 SrcLT.second, CostKind) + 1559 SrcLT.first - 1; 1560 } 1561 break; 1562 }; 1563 1564 // Our actual lowering for the case where a wider legal type is available 1565 // uses promotion to the wider type. This is reflected in the result of 1566 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are 1567 // scalarized if the legalized Src and Dst are not equal sized. 
1568 const DataLayout &DL = this->getDataLayout(); 1569 if (!SrcLT.second.isVector() || !DstLT.second.isVector() || 1570 !SrcLT.first.isValid() || !DstLT.first.isValid() || 1571 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src), 1572 SrcLT.second.getSizeInBits()) || 1573 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst), 1574 DstLT.second.getSizeInBits())) 1575 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1576 1577 // The split cost is handled by the base getCastInstrCost 1578 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type"); 1579 1580 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) - 1581 (int)Log2_32(SrcLT.second.getScalarSizeInBits()); 1582 switch (ISD) { 1583 case ISD::SIGN_EXTEND: 1584 case ISD::ZERO_EXTEND: { 1585 if ((PowDiff < 1) || (PowDiff > 3)) 1586 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1587 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8}; 1588 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8}; 1589 unsigned Op = 1590 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1]; 1591 return getRISCVInstructionCost(Op, DstLT.second, CostKind); 1592 } 1593 case ISD::TRUNCATE: 1594 case ISD::FP_EXTEND: 1595 case ISD::FP_ROUND: { 1596 // Counts of narrow/widen instructions. 1597 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits(); 1598 unsigned DstEltSize = DstLT.second.getScalarSizeInBits(); 1599 1600 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI 1601 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V 1602 : RISCV::VFNCVT_F_F_W; 1603 InstructionCost Cost = 0; 1604 for (; SrcEltSize != DstEltSize;) { 1605 MVT ElementMVT = (ISD == ISD::TRUNCATE) 1606 ? MVT::getIntegerVT(DstEltSize) 1607 : MVT::getFloatingPointVT(DstEltSize); 1608 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT); 1609 DstEltSize = 1610 (DstEltSize > SrcEltSize) ? 
DstEltSize >> 1 : DstEltSize << 1; 1611 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind); 1612 } 1613 return Cost; 1614 } 1615 case ISD::FP_TO_SINT: 1616 case ISD::FP_TO_UINT: { 1617 unsigned IsSigned = ISD == ISD::FP_TO_SINT; 1618 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V; 1619 unsigned FWCVT = 1620 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V; 1621 unsigned FNCVT = 1622 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W; 1623 unsigned SrcEltSize = Src->getScalarSizeInBits(); 1624 unsigned DstEltSize = Dst->getScalarSizeInBits(); 1625 InstructionCost Cost = 0; 1626 if ((SrcEltSize == 16) && 1627 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) { 1628 // If the target only supports zvfhmin or it is fp16-to-i64 conversion 1629 // pre-widening to f32 and then convert f32 to integer 1630 VectorType *VecF32Ty = 1631 VectorType::get(Type::getFloatTy(Dst->getContext()), 1632 cast<VectorType>(Dst)->getElementCount()); 1633 std::pair<InstructionCost, MVT> VecF32LT = 1634 getTypeLegalizationCost(VecF32Ty); 1635 Cost += 1636 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V, 1637 VecF32LT.second, CostKind); 1638 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I); 1639 return Cost; 1640 } 1641 if (DstEltSize == SrcEltSize) 1642 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind); 1643 else if (DstEltSize > SrcEltSize) 1644 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind); 1645 else { // (SrcEltSize > DstEltSize) 1646 // First do a narrowing conversion to an integer half the size, then 1647 // truncate if needed. 
1648 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2); 1649 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT); 1650 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind); 1651 if ((SrcEltSize / 2) > DstEltSize) { 1652 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext()); 1653 Cost += 1654 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I); 1655 } 1656 } 1657 return Cost; 1658 } 1659 case ISD::SINT_TO_FP: 1660 case ISD::UINT_TO_FP: { 1661 unsigned IsSigned = ISD == ISD::SINT_TO_FP; 1662 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V; 1663 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V; 1664 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W; 1665 unsigned SrcEltSize = Src->getScalarSizeInBits(); 1666 unsigned DstEltSize = Dst->getScalarSizeInBits(); 1667 1668 InstructionCost Cost = 0; 1669 if ((DstEltSize == 16) && 1670 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) { 1671 // If the target only supports zvfhmin or it is i64-to-fp16 conversion 1672 // it is converted to f32 and then converted to f16 1673 VectorType *VecF32Ty = 1674 VectorType::get(Type::getFloatTy(Dst->getContext()), 1675 cast<VectorType>(Dst)->getElementCount()); 1676 std::pair<InstructionCost, MVT> VecF32LT = 1677 getTypeLegalizationCost(VecF32Ty); 1678 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I); 1679 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W, 1680 DstLT.second, CostKind); 1681 return Cost; 1682 } 1683 1684 if (DstEltSize == SrcEltSize) 1685 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind); 1686 else if (DstEltSize > SrcEltSize) { 1687 if ((DstEltSize / 2) > SrcEltSize) { 1688 VectorType *VecTy = 1689 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2), 1690 cast<VectorType>(Dst)->getElementCount()); 1691 unsigned Op = IsSigned ? 
Instruction::SExt : Instruction::ZExt; 1692 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I); 1693 } 1694 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind); 1695 } else 1696 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind); 1697 return Cost; 1698 } 1699 } 1700 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1701 } 1702 1703 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const { 1704 if (isa<ScalableVectorType>(Ty)) { 1705 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType()); 1706 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue(); 1707 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock; 1708 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize); 1709 } 1710 return cast<FixedVectorType>(Ty)->getNumElements(); 1711 } 1712 1713 InstructionCost 1714 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 1715 FastMathFlags FMF, 1716 TTI::TargetCostKind CostKind) const { 1717 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors()) 1718 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 1719 1720 // Skip if scalar size of Ty is bigger than ELEN. 
1721 if (Ty->getScalarSizeInBits() > ST->getELen()) 1722 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 1723 1724 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 1725 if (Ty->getElementType()->isIntegerTy(1)) { 1726 // SelectionDAGBuilder does following transforms: 1727 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>) 1728 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>) 1729 if (IID == Intrinsic::umax || IID == Intrinsic::smin) 1730 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind); 1731 else 1732 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind); 1733 } 1734 1735 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) { 1736 SmallVector<unsigned, 3> Opcodes; 1737 InstructionCost ExtraCost = 0; 1738 switch (IID) { 1739 case Intrinsic::maximum: 1740 if (FMF.noNaNs()) { 1741 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S}; 1742 } else { 1743 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS, 1744 RISCV::VFMV_F_S}; 1745 // Cost of Canonical Nan + branch 1746 // lui a0, 523264 1747 // fmv.w.x fa0, a0 1748 Type *DstTy = Ty->getScalarType(); 1749 const unsigned EltTyBits = DstTy->getScalarSizeInBits(); 1750 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits); 1751 ExtraCost = 1 + 1752 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy, 1753 TTI::CastContextHint::None, CostKind) + 1754 getCFInstrCost(Instruction::Br, CostKind); 1755 } 1756 break; 1757 1758 case Intrinsic::minimum: 1759 if (FMF.noNaNs()) { 1760 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S}; 1761 } else { 1762 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS, 1763 RISCV::VFMV_F_S}; 1764 // Cost of Canonical Nan + branch 1765 // lui a0, 523264 1766 // fmv.w.x fa0, a0 1767 Type *DstTy = Ty->getScalarType(); 1768 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy); 1769 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits); 
1770 ExtraCost = 1 + 1771 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy, 1772 TTI::CastContextHint::None, CostKind) + 1773 getCFInstrCost(Instruction::Br, CostKind); 1774 } 1775 break; 1776 } 1777 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind); 1778 } 1779 1780 // IR Reduction is composed by one rvv reduction instruction and vmv 1781 unsigned SplitOp; 1782 SmallVector<unsigned, 3> Opcodes; 1783 switch (IID) { 1784 default: 1785 llvm_unreachable("Unsupported intrinsic"); 1786 case Intrinsic::smax: 1787 SplitOp = RISCV::VMAX_VV; 1788 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S}; 1789 break; 1790 case Intrinsic::smin: 1791 SplitOp = RISCV::VMIN_VV; 1792 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S}; 1793 break; 1794 case Intrinsic::umax: 1795 SplitOp = RISCV::VMAXU_VV; 1796 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S}; 1797 break; 1798 case Intrinsic::umin: 1799 SplitOp = RISCV::VMINU_VV; 1800 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S}; 1801 break; 1802 case Intrinsic::maxnum: 1803 SplitOp = RISCV::VFMAX_VV; 1804 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S}; 1805 break; 1806 case Intrinsic::minnum: 1807 SplitOp = RISCV::VFMIN_VV; 1808 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S}; 1809 break; 1810 } 1811 // Add a cost for data larger than LMUL8 1812 InstructionCost SplitCost = 1813 (LT.first > 1) ? (LT.first - 1) * 1814 getRISCVInstructionCost(SplitOp, LT.second, CostKind) 1815 : 0; 1816 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind); 1817 } 1818 1819 InstructionCost 1820 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 1821 std::optional<FastMathFlags> FMF, 1822 TTI::TargetCostKind CostKind) const { 1823 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors()) 1824 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); 1825 1826 // Skip if scalar size of Ty is bigger than ELEN. 
if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Only add/or/xor/and/fadd reductions are modelled here; every other opcode
  // is left to the generic implementation.
  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  Type *ElementTy = Ty->getElementType();
  // Mask (i1) reductions are costed as mask-register instructions followed by
  // a scalar compare or a constant 1, depending on the operation.
  if (ElementTy->isIntegerTy(1)) {
    // Example sequences:
    //   vfirst.m a0, v0
    //   seqz a0, a0
    if (LT.second == MVT::v1i1)
      return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);

    if (ISD == ISD::AND) {
      // Example sequences:
      //   vmand.mm v8, v9, v8 ; needed every time type is split
      //   vmnot.m v8, v0      ; alias for vmnand
      //   vcpop.m a0, v8
      //   seqz a0, a0

      // See the discussion: https://github.com/llvm/llvm-project/pull/119160
      // For LMUL <= 8, there is no splitting,
      //   the sequences are vmnot, vcpop and seqz.
      // When LMUL > 8 and split = 1,
      //   the sequences are vmnand, vcpop and seqz.
      // When LMUL > 8 and split > 1,
      //   the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
      return ((LT.first > 2) ? (LT.first - 2) : 0) *
                 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
    } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
      // Example sequences:
      //   vsetvli a0, zero, e8, mf8, ta, ma
      //   vmxor.mm v8, v0, v8 ; needed every time type is split
      //   vcpop.m a0, v8
      //   andi a0, a0, 1
      // The trailing "+ 1" is the constant cost added after the vcpop; it
      // lines up with the andi in the example sequence above.
      return (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
    } else {
      assert(ISD == ISD::OR);
      // Example sequences:
      //   vsetvli a0, zero, e8, mf8, ta, ma
      //   vmor.mm v8, v9, v8 ; needed every time type is split
      //   vcpop.m a0, v0
      //   snez a0, a0
      return (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
    }
  }

  // IR Reduction of or/and is composed by one vmv and one rvv reduction
  // instruction, and others is composed by two vmv and one rvv reduction
  // instruction.
  // SplitOp is the element-wise opcode charged once per extra legalized part;
  // Opcodes is the instruction sequence charged for the reduction itself.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (ISD) {
  case ISD::ADD:
    SplitOp = RISCV::VADD_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
    break;
  case ISD::OR:
    SplitOp = RISCV::VOR_VV;
    Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::XOR:
    SplitOp = RISCV::VXOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::AND:
    SplitOp = RISCV::VAND_VV;
    Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
    break;
  case ISD::FADD:
    //
We can't promote f16/bf16 fadd reductions. 1916 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) || 1917 LT.second.getScalarType() == MVT::bf16) 1918 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); 1919 if (TTI::requiresOrderedReduction(FMF)) { 1920 Opcodes.push_back(RISCV::VFMV_S_F); 1921 for (unsigned i = 0; i < LT.first.getValue(); i++) 1922 Opcodes.push_back(RISCV::VFREDOSUM_VS); 1923 Opcodes.push_back(RISCV::VFMV_F_S); 1924 return getRISCVInstructionCost(Opcodes, LT.second, CostKind); 1925 } 1926 SplitOp = RISCV::VFADD_VV; 1927 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S}; 1928 break; 1929 } 1930 // Add a cost for data larger than LMUL8 1931 InstructionCost SplitCost = 1932 (LT.first > 1) ? (LT.first - 1) * 1933 getRISCVInstructionCost(SplitOp, LT.second, CostKind) 1934 : 0; 1935 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind); 1936 } 1937 1938 InstructionCost RISCVTTIImpl::getExtendedReductionCost( 1939 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, 1940 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const { 1941 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors()) 1942 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1943 FMF, CostKind); 1944 1945 // Skip if scalar size of ResTy is bigger than ELEN. 
1946 if (ResTy->getScalarSizeInBits() > ST->getELen()) 1947 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1948 FMF, CostKind); 1949 1950 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd) 1951 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1952 FMF, CostKind); 1953 1954 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1955 1956 if (IsUnsigned && Opcode == Instruction::Add && 1957 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) { 1958 // Represent vector_reduce_add(ZExt(<n x i1>)) as 1959 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)). 1960 return LT.first * 1961 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind); 1962 } 1963 1964 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits()) 1965 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, 1966 FMF, CostKind); 1967 1968 return (LT.first - 1) + 1969 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1970 } 1971 1972 InstructionCost 1973 RISCVTTIImpl::getStoreImmCost(Type *Ty, TTI::OperandValueInfo OpInfo, 1974 TTI::TargetCostKind CostKind) const { 1975 assert(OpInfo.isConstant() && "non constant operand?"); 1976 if (!isa<VectorType>(Ty)) 1977 // FIXME: We need to account for immediate materialization here, but doing 1978 // a decent job requires more knowledge about the immediate than we 1979 // currently have here. 1980 return 0; 1981 1982 if (OpInfo.isUniform()) 1983 // vmv.v.i, vmv.v.x, or vfmv.v.f 1984 // We ignore the cost of the scalar constant materialization to be consistent 1985 // with how we treat scalar constants themselves just above. 
1986 return 1; 1987 1988 return getConstantPoolLoadCost(Ty, CostKind); 1989 } 1990 1991 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 1992 Align Alignment, 1993 unsigned AddressSpace, 1994 TTI::TargetCostKind CostKind, 1995 TTI::OperandValueInfo OpInfo, 1996 const Instruction *I) const { 1997 EVT VT = TLI->getValueType(DL, Src, true); 1998 // Type legalization can't handle structs 1999 if (VT == MVT::Other) 2000 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 2001 CostKind, OpInfo, I); 2002 2003 InstructionCost Cost = 0; 2004 if (Opcode == Instruction::Store && OpInfo.isConstant()) 2005 Cost += getStoreImmCost(Src, OpInfo, CostKind); 2006 2007 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 2008 2009 InstructionCost BaseCost = [&]() { 2010 InstructionCost Cost = LT.first; 2011 if (CostKind != TTI::TCK_RecipThroughput) 2012 return Cost; 2013 2014 // Our actual lowering for the case where a wider legal type is available 2015 // uses the a VL predicated load on the wider type. This is reflected in 2016 // the result of getTypeLegalizationCost, but BasicTTI assumes the 2017 // widened cases are scalarized. 2018 const DataLayout &DL = this->getDataLayout(); 2019 if (Src->isVectorTy() && LT.second.isVector() && 2020 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), 2021 LT.second.getSizeInBits())) 2022 return Cost; 2023 2024 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 2025 CostKind, OpInfo, I); 2026 }(); 2027 2028 // Assume memory ops cost scale with the number of vector registers 2029 // possible accessed by the instruction. Note that BasicTTI already 2030 // handles the LT.first term for us. 
2031 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize) 2032 BaseCost *= TLI->getLMULCost(LT.second); 2033 return Cost + BaseCost; 2034 } 2035 2036 InstructionCost RISCVTTIImpl::getCmpSelInstrCost( 2037 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, 2038 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, 2039 TTI::OperandValueInfo Op2Info, const Instruction *I) const { 2040 if (CostKind != TTI::TCK_RecipThroughput) 2041 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2042 Op1Info, Op2Info, I); 2043 2044 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors()) 2045 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2046 Op1Info, Op2Info, I); 2047 2048 // Skip if scalar size of ValTy is bigger than ELEN. 2049 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen()) 2050 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2051 Op1Info, Op2Info, I); 2052 2053 auto GetConstantMatCost = 2054 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost { 2055 if (OpInfo.isUniform()) 2056 // We return 0 we currently ignore the cost of materializing scalar 2057 // constants in GPRs. 
return 0;

    // Non-uniform constant vectors are costed as a constant-pool load.
    return getConstantPoolLoadCost(ValTy, CostKind);
  };

  // Sum of the materialization costs of whichever operands are constants.
  InstructionCost ConstantMatCost;
  if (Op1Info.isConstant())
    ConstantMatCost += GetConstantMatCost(Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += GetConstantMatCost(Op2Info);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    // Vector condition: the mask is already available.
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return ConstantMatCost +
               LT.first *
                   getRISCVInstructionCost(
                       {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                       LT.second, CostKind);
      }
      // vselect and max/min are supported natively.
      return ConstantMatCost +
             LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
                                                CostKind);
    }

    // Scalar condition selecting between mask vectors: the condition is first
    // splatted and compared in an i8-element vector of the same shape.
    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                         InterimVT, CostKind) +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                            LT.second, CostKind);
    }

    // Scalar condition selecting between non-mask vectors.
    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return ConstantMatCost +
           LT.first * getRISCVInstructionCost(
                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
                          LT.second, CostKind);
  }

  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
      CmpInst::isIntPredicate(VecPred)) {
    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
    // provided they incur the same cost across all implementations
    return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
                                                                LT.second,
                                                                CostKind);
  }

  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
      CmpInst::isFPPredicate(VecPred)) {

    // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
    // NOTE(review): unlike the other returns in this function, this cost is
    // not multiplied by LT.first -- confirm that is intended.
    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
      return ConstantMatCost +
             getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       Op1Info, Op2Info, I);

    // Assuming vector fp compare and mask instructions are all the same cost
    // until a need arises to differentiate them.
    // Each group below is costed with one representative sequence; the
    // per-predicate comments list the instructions that predicate lowers to.
    switch (VecPred) {
    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
      return ConstantMatCost +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
                            LT.second, CostKind);

    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
                                         LT.second, CostKind);

    case CmpInst::FCMP_OEQ: // vmfeq.vv
    case CmpInst::FCMP_OGT: // vmflt.vv
    case CmpInst::FCMP_OGE: // vmfle.vv
    case CmpInst::FCMP_OLT: // vmflt.vv
    case CmpInst::FCMP_OLE: // vmfle.vv
    case CmpInst::FCMP_UNE: // vmfne.vv
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
    default:
      break;
    }
  }

  // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
  // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
  // generate a conditional branch + mv. The cost of scalar (icmp + select) will
  // be (0 + select instr cost).
  // The icmp is free only when every user is an integer select that uses it
  // as the condition and has no constant-data operand in either arm.
  if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
      ValTy->isIntegerTy() && !I->user_empty()) {
    if (all_of(I->users(), [&](const User *U) {
          return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
                 U->getType()->isIntegerTy() &&
                 !isa<ConstantData>(U->getOperand(1)) &&
                 !isa<ConstantData>(U->getOperand(2));
        }))
      return 0;
  }

  // TODO: Add cost for scalar type.
2191 2192 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2193 Op1Info, Op2Info, I); 2194 } 2195 2196 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode, 2197 TTI::TargetCostKind CostKind, 2198 const Instruction *I) const { 2199 if (CostKind != TTI::TCK_RecipThroughput) 2200 return Opcode == Instruction::PHI ? 0 : 1; 2201 // Branches are assumed to be predicted. 2202 return 0; 2203 } 2204 2205 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 2206 TTI::TargetCostKind CostKind, 2207 unsigned Index, 2208 const Value *Op0, 2209 const Value *Op1) const { 2210 assert(Val->isVectorTy() && "This must be a vector type"); 2211 2212 if (Opcode != Instruction::ExtractElement && 2213 Opcode != Instruction::InsertElement) 2214 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); 2215 2216 // Legalize the type. 2217 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 2218 2219 // This type is legalized to a scalar type. 2220 if (!LT.second.isVector()) { 2221 auto *FixedVecTy = cast<FixedVectorType>(Val); 2222 // If Index is a known constant, cost is zero. 2223 if (Index != -1U) 2224 return 0; 2225 // Extract/InsertElement with non-constant index is very costly when 2226 // scalarized; estimate cost of loads/stores sequence via the stack: 2227 // ExtractElement cost: store vector to stack, load scalar; 2228 // InsertElement cost: store vector to stack, store scalar, load vector. 2229 Type *ElemTy = FixedVecTy->getElementType(); 2230 auto NumElems = FixedVecTy->getNumElements(); 2231 auto Align = DL.getPrefTypeAlign(ElemTy); 2232 InstructionCost LoadCost = 2233 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind); 2234 InstructionCost StoreCost = 2235 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind); 2236 return Opcode == Instruction::ExtractElement 2237 ? 
StoreCost * NumElems + LoadCost 2238 : (StoreCost + LoadCost) * NumElems + StoreCost; 2239 } 2240 2241 // For unsupported scalable vector. 2242 if (LT.second.isScalableVector() && !LT.first.isValid()) 2243 return LT.first; 2244 2245 // Mask vector extract/insert is expanded via e8. 2246 if (Val->getScalarSizeInBits() == 1) { 2247 VectorType *WideTy = 2248 VectorType::get(IntegerType::get(Val->getContext(), 8), 2249 cast<VectorType>(Val)->getElementCount()); 2250 if (Opcode == Instruction::ExtractElement) { 2251 InstructionCost ExtendCost 2252 = getCastInstrCost(Instruction::ZExt, WideTy, Val, 2253 TTI::CastContextHint::None, CostKind); 2254 InstructionCost ExtractCost 2255 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr); 2256 return ExtendCost + ExtractCost; 2257 } 2258 InstructionCost ExtendCost 2259 = getCastInstrCost(Instruction::ZExt, WideTy, Val, 2260 TTI::CastContextHint::None, CostKind); 2261 InstructionCost InsertCost 2262 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr); 2263 InstructionCost TruncCost 2264 = getCastInstrCost(Instruction::Trunc, Val, WideTy, 2265 TTI::CastContextHint::None, CostKind); 2266 return ExtendCost + InsertCost + TruncCost; 2267 } 2268 2269 2270 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector 2271 // and vslideup + vmv.s.x to insert element to vector. 2272 unsigned BaseCost = 1; 2273 // When insertelement we should add the index with 1 as the input of vslideup. 2274 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1; 2275 2276 if (Index != -1U) { 2277 // The type may be split. For fixed-width vectors we can normalize the 2278 // index to the new type. 2279 if (LT.second.isFixedLengthVector()) { 2280 unsigned Width = LT.second.getVectorNumElements(); 2281 Index = Index % Width; 2282 } 2283 2284 // If exact VLEN is known, we will insert/extract into the appropriate 2285 // subvector with no additional subvector insert/extract cost. 
2286 if (auto VLEN = ST->getRealVLen()) { 2287 unsigned EltSize = LT.second.getScalarSizeInBits(); 2288 unsigned M1Max = *VLEN / EltSize; 2289 Index = Index % M1Max; 2290 } 2291 2292 if (Index == 0) 2293 // We can extract/insert the first element without vslidedown/vslideup. 2294 SlideCost = 0; 2295 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) && 2296 Val->getScalarType()->isIntegerTy()) 2297 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed 2298 else if (Opcode == Instruction::InsertElement) 2299 SlideCost = 1; // With a constant index, we do not need to use addi. 2300 } 2301 2302 // When the vector needs to split into multiple register groups and the index 2303 // exceeds single vector register group, we need to insert/extract the element 2304 // via stack. 2305 if (LT.first > 1 && 2306 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() && 2307 LT.second.isScalableVector()))) { 2308 Type *ScalarType = Val->getScalarType(); 2309 Align VecAlign = DL.getPrefTypeAlign(Val); 2310 Align SclAlign = DL.getPrefTypeAlign(ScalarType); 2311 // Extra addi for unknown index. 2312 InstructionCost IdxCost = Index == -1U ? 1 : 0; 2313 2314 // Store all split vectors into stack and load the target element. 2315 if (Opcode == Instruction::ExtractElement) 2316 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + 2317 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, 2318 CostKind) + 2319 IdxCost; 2320 2321 // Store all split vectors into stack and store the target element and load 2322 // vectors back. 2323 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + 2324 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) + 2325 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, 2326 CostKind) + 2327 IdxCost; 2328 } 2329 2330 // Extract i64 in the target that has XLEN=32 need more instruction. 
2331 if (Val->getScalarType()->isIntegerTy() && 2332 ST->getXLen() < Val->getScalarSizeInBits()) { 2333 // For extractelement, we need the following instructions: 2334 // vsetivli zero, 1, e64, m1, ta, mu (not count) 2335 // vslidedown.vx v8, v8, a0 2336 // vmv.x.s a0, v8 2337 // li a1, 32 2338 // vsrl.vx v8, v8, a1 2339 // vmv.x.s a1, v8 2340 2341 // For insertelement, we need the following instructions: 2342 // vsetivli zero, 2, e32, m4, ta, mu (not count) 2343 // vmv.v.i v12, 0 2344 // vslide1up.vx v16, v12, a1 2345 // vslide1up.vx v12, v16, a0 2346 // addi a0, a2, 1 2347 // vsetvli zero, a0, e64, m4, tu, mu (not count) 2348 // vslideup.vx v8, v12, a2 2349 2350 // TODO: should we count these special vsetvlis? 2351 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4; 2352 } 2353 return BaseCost + SlideCost; 2354 } 2355 2356 InstructionCost RISCVTTIImpl::getArithmeticInstrCost( 2357 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 2358 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 2359 ArrayRef<const Value *> Args, const Instruction *CxtI) const { 2360 2361 // TODO: Handle more cost kinds. 2362 if (CostKind != TTI::TCK_RecipThroughput) 2363 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 2364 Args, CxtI); 2365 2366 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors()) 2367 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 2368 Args, CxtI); 2369 2370 // Skip if scalar size of Ty is bigger than ELEN. 2371 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen()) 2372 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 2373 Args, CxtI); 2374 2375 // Legalize the type. 2376 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 2377 2378 // TODO: Handle scalar type. 
if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // f16 with zvfhmin and bf16 will be promoted to f32.
  // FIXME: nxv32[b]f16 will be custom lowered and split.
  unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  // Cost of the conversions around a promoted operation; stays 0 when the
  // operation is not promoted.
  InstructionCost CastCost = 0;
  if ((LT.second.getVectorElementType() == MVT::f16 ||
       LT.second.getVectorElementType() == MVT::bf16) &&
      TLI->getOperationAction(ISDOpcode, LT.second) ==
          TargetLoweringBase::LegalizeAction::Promote) {
    MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
    Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
    Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    // Add cost of extending arguments
    // (one extension per entry in Args, so an empty Args adds nothing here).
    CastCost += LT.first * Args.size() *
                getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
                                 TTI::CastContextHint::None, CostKind);
    // Add cost of truncating result
    CastCost +=
        LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
                                    TTI::CastContextHint::None, CostKind);
    // Compute cost of op in promoted type
    LT.second = PromotedVT;
  }

  // Cost of materializing the constant in operand position `Operand`.
  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in scalar register
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    // Anything else is costed as a constant-pool load.
    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  // Pick one representative RVV opcode per group of operations; the operations
  // in a group are costed identically.
  unsigned Op;
  switch (ISDOpcode) {
  case ISD::ADD:
  case ISD::SUB:
    Op = RISCV::VADD_VV;
    break;
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    Op = RISCV::VSLL_VV;
    break;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    // i1 vectors use the mask-register form.
    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
    break;
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
    Op = RISCV::VMUL_VV;
    break;
  case ISD::SDIV:
  case ISD::UDIV:
    Op = RISCV::VDIV_VV;
    break;
  case ISD::SREM:
  case ISD::UREM:
    Op = RISCV::VREM_VV;
    break;
  case ISD::FADD:
  case ISD::FSUB:
    Op = RISCV::VFADD_VV;
    break;
  case ISD::FMUL:
    Op = RISCV::VFMUL_VV;
    break;
  case ISD::FDIV:
    Op = RISCV::VFDIV_VV;
    break;
  case ISD::FNEG:
    Op = RISCV::VFSGNJN_VV;
    break;
  default:
    // Assuming all other instructions have the same cost until a need arises to
    // differentiate them.
    return CastCost + ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }

  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
  // ops are twice as expensive as integer ops. Do the same for vectors so
  // scalar floating point ops aren't cheaper than their vector equivalents.
  if (Ty->isFPOrFPVectorTy())
    InstrCost *= 2;
  return CastCost + ConstantMatCost + LT.first * InstrCost;
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2486 InstructionCost RISCVTTIImpl::getPointersChainCost( 2487 ArrayRef<const Value *> Ptrs, const Value *Base, 2488 const TTI::PointersChainInfo &Info, Type *AccessTy, 2489 TTI::TargetCostKind CostKind) const { 2490 InstructionCost Cost = TTI::TCC_Free; 2491 // In the basic model we take into account GEP instructions only 2492 // (although here can come alloca instruction, a value, constants and/or 2493 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a 2494 // pointer). Typically, if Base is a not a GEP-instruction and all the 2495 // pointers are relative to the same base address, all the rest are 2496 // either GEP instructions, PHIs, bitcasts or constants. When we have same 2497 // base, we just calculate cost of each non-Base GEP as an ADD operation if 2498 // any their index is a non-const. 2499 // If no known dependencies between the pointers cost is calculated as a sum 2500 // of costs of GEP instructions. 2501 for (auto [I, V] : enumerate(Ptrs)) { 2502 const auto *GEP = dyn_cast<GetElementPtrInst>(V); 2503 if (!GEP) 2504 continue; 2505 if (Info.isSameBase() && V != Base) { 2506 if (GEP->hasAllConstantIndices()) 2507 continue; 2508 // If the chain is unit-stride and BaseReg + stride*i is a legal 2509 // addressing mode, then presume the base GEP is sitting around in a 2510 // register somewhere and check if we can fold the offset relative to 2511 // it. 
2512 unsigned Stride = DL.getTypeStoreSize(AccessTy); 2513 if (Info.isUnitStride() && 2514 isLegalAddressingMode(AccessTy, 2515 /* BaseGV */ nullptr, 2516 /* BaseOffset */ Stride * I, 2517 /* HasBaseReg */ true, 2518 /* Scale */ 0, 2519 GEP->getType()->getPointerAddressSpace())) 2520 continue; 2521 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind, 2522 {TTI::OK_AnyValue, TTI::OP_None}, 2523 {TTI::OK_AnyValue, TTI::OP_None}, {}); 2524 } else { 2525 SmallVector<const Value *> Indices(GEP->indices()); 2526 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(), 2527 Indices, AccessTy, CostKind); 2528 } 2529 } 2530 return Cost; 2531 } 2532 2533 void RISCVTTIImpl::getUnrollingPreferences( 2534 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, 2535 OptimizationRemarkEmitter *ORE) const { 2536 // TODO: More tuning on benchmarks and metrics with changes as needed 2537 // would apply to all settings below to enable performance. 2538 2539 2540 if (ST->enableDefaultUnroll()) 2541 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); 2542 2543 // Enable Upper bound unrolling universally, not dependent upon the conditions 2544 // below. 2545 UP.UpperBound = true; 2546 2547 // Disable loop unrolling for Oz and Os. 2548 UP.OptSizeThreshold = 0; 2549 UP.PartialOptSizeThreshold = 0; 2550 if (L->getHeader()->getParent()->hasOptSize()) 2551 return; 2552 2553 SmallVector<BasicBlock *, 4> ExitingBlocks; 2554 L->getExitingBlocks(ExitingBlocks); 2555 LLVM_DEBUG(dbgs() << "Loop has:\n" 2556 << "Blocks: " << L->getNumBlocks() << "\n" 2557 << "Exit blocks: " << ExitingBlocks.size() << "\n"); 2558 2559 // Only allow another exit other than the latch. This acts as an early exit 2560 // as it mirrors the profitability calculation of the runtime unroller. 2561 if (ExitingBlocks.size() > 2) 2562 return; 2563 2564 // Limit the CFG of the loop body for targets with a branch predictor. 
2565 // Allowing 4 blocks permits if-then-else diamonds in the body. 2566 if (L->getNumBlocks() > 4) 2567 return; 2568 2569 // Don't unroll vectorized loops, including the remainder loop 2570 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) 2571 return; 2572 2573 // Scan the loop: don't unroll loops with calls as this could prevent 2574 // inlining. 2575 InstructionCost Cost = 0; 2576 for (auto *BB : L->getBlocks()) { 2577 for (auto &I : *BB) { 2578 // Initial setting - Don't unroll loops containing vectorized 2579 // instructions. 2580 if (I.getType()->isVectorTy()) 2581 return; 2582 2583 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2584 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2585 if (!isLoweredToCall(F)) 2586 continue; 2587 } 2588 return; 2589 } 2590 2591 SmallVector<const Value *> Operands(I.operand_values()); 2592 Cost += getInstructionCost(&I, Operands, 2593 TargetTransformInfo::TCK_SizeAndLatency); 2594 } 2595 } 2596 2597 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); 2598 2599 UP.Partial = true; 2600 UP.Runtime = true; 2601 UP.UnrollRemainder = true; 2602 UP.UnrollAndJam = true; 2603 2604 // Force unrolling small loops can be very useful because of the branch 2605 // taken cost of the backedge. 
2606 if (Cost < 12) 2607 UP.Force = true; 2608 } 2609 2610 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2611 TTI::PeelingPreferences &PP) const { 2612 BaseT::getPeelingPreferences(L, SE, PP); 2613 } 2614 2615 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const { 2616 if (Ty->isVectorTy()) { 2617 // f16 with only zvfhmin and bf16 will be promoted to f32 2618 Type *EltTy = cast<VectorType>(Ty)->getElementType(); 2619 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) || 2620 EltTy->isBFloatTy()) 2621 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()), 2622 cast<VectorType>(Ty)); 2623 2624 TypeSize Size = DL.getTypeSizeInBits(Ty); 2625 if (Size.isScalable() && ST->hasVInstructions()) 2626 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock); 2627 2628 if (ST->useRVVForFixedLengthVectors()) 2629 return divideCeil(Size, ST->getRealMinVLen()); 2630 } 2631 2632 return BaseT::getRegUsageForType(Ty); 2633 } 2634 2635 unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { 2636 if (SLPMaxVF.getNumOccurrences()) 2637 return SLPMaxVF; 2638 2639 // Return how many elements can fit in getRegisterBitwidth. This is the 2640 // same routine as used in LoopVectorizer. We should probably be 2641 // accounting for whether we actually have instructions with the right 2642 // lane type, but we don't have enough information to do that without 2643 // some additional plumbing which hasn't been justified yet. 2644 TypeSize RegWidth = 2645 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); 2646 // If no vector registers, or absurd element widths, disable 2647 // vectorization by returning 1. 
2648 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth); 2649 } 2650 2651 unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const { 2652 return RVVMinTripCount; 2653 } 2654 2655 TTI::AddressingModeKind 2656 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L, 2657 ScalarEvolution *SE) const { 2658 if (ST->hasVendorXCVmem() && !ST->is64Bit()) 2659 return TTI::AMK_PostIndexed; 2660 2661 return BasicTTIImplBase::getPreferredAddressingMode(L, SE); 2662 } 2663 2664 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, 2665 const TargetTransformInfo::LSRCost &C2) const { 2666 // RISC-V specific here are "instruction number 1st priority". 2667 // If we need to emit adds inside the loop to add up base registers, then 2668 // we need at least one extra temporary register. 2669 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0); 2670 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0); 2671 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, 2672 C1.NumIVMuls, C1.NumBaseAdds, 2673 C1.ScaleCost, C1.ImmCost, C1.SetupCost) < 2674 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, 2675 C2.NumIVMuls, C2.NumBaseAdds, 2676 C2.ScaleCost, C2.ImmCost, C2.SetupCost); 2677 } 2678 2679 bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, 2680 Align Alignment) const { 2681 auto *VTy = dyn_cast<VectorType>(DataTy); 2682 if (!VTy || VTy->isScalableTy()) 2683 return false; 2684 2685 if (!isLegalMaskedLoadStore(DataTy, Alignment)) 2686 return false; 2687 2688 // FIXME: If it is an i8 vector and the element count exceeds 256, we should 2689 // scalarize these types with LMUL >= maximum fixed-length LMUL. 
2690 if (VTy->getElementType()->isIntegerTy(8)) 2691 if (VTy->getElementCount().getFixedValue() > 256) 2692 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() < 2693 ST->getMaxLMULForFixedLengthVectors(); 2694 return true; 2695 } 2696 2697 bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, 2698 Align Alignment) const { 2699 auto *VTy = dyn_cast<VectorType>(DataTy); 2700 if (!VTy || VTy->isScalableTy()) 2701 return false; 2702 2703 if (!isLegalMaskedLoadStore(DataTy, Alignment)) 2704 return false; 2705 return true; 2706 } 2707 2708 /// See if \p I should be considered for address type promotion. We check if \p 2709 /// I is a sext with right type and used in memory accesses. If it used in a 2710 /// "complex" getelementptr, we allow it to be promoted without finding other 2711 /// sext instructions that sign extended the same initial value. A getelementptr 2712 /// is considered as "complex" if it has more than 2 operands. 2713 bool RISCVTTIImpl::shouldConsiderAddressTypePromotion( 2714 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { 2715 bool Considerable = false; 2716 AllowPromotionWithoutCommonHeader = false; 2717 if (!isa<SExtInst>(&I)) 2718 return false; 2719 Type *ConsideredSExtType = 2720 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2721 if (I.getType() != ConsideredSExtType) 2722 return false; 2723 // See if the sext is the one with the right type and used in at least one 2724 // GetElementPtrInst. 2725 for (const User *U : I.users()) { 2726 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2727 Considerable = true; 2728 // A getelementptr is considered as "complex" if it has more than 2 2729 // operands. We will promote a SExt used in such complex GEP as we 2730 // expect some computation to be merged if they are done on 64 bits. 
2731 if (GEPInst->getNumOperands() > 2) { 2732 AllowPromotionWithoutCommonHeader = true; 2733 break; 2734 } 2735 } 2736 } 2737 return Considerable; 2738 } 2739 2740 bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const { 2741 switch (Opcode) { 2742 case Instruction::Add: 2743 case Instruction::Sub: 2744 case Instruction::Mul: 2745 case Instruction::And: 2746 case Instruction::Or: 2747 case Instruction::Xor: 2748 case Instruction::FAdd: 2749 case Instruction::FSub: 2750 case Instruction::FMul: 2751 case Instruction::FDiv: 2752 case Instruction::ICmp: 2753 case Instruction::FCmp: 2754 return true; 2755 case Instruction::Shl: 2756 case Instruction::LShr: 2757 case Instruction::AShr: 2758 case Instruction::UDiv: 2759 case Instruction::SDiv: 2760 case Instruction::URem: 2761 case Instruction::SRem: 2762 case Instruction::Select: 2763 return Operand == 1; 2764 default: 2765 return false; 2766 } 2767 } 2768 2769 bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const { 2770 if (!I->getType()->isVectorTy() || !ST->hasVInstructions()) 2771 return false; 2772 2773 if (canSplatOperand(I->getOpcode(), Operand)) 2774 return true; 2775 2776 auto *II = dyn_cast<IntrinsicInst>(I); 2777 if (!II) 2778 return false; 2779 2780 switch (II->getIntrinsicID()) { 2781 case Intrinsic::fma: 2782 case Intrinsic::vp_fma: 2783 case Intrinsic::fmuladd: 2784 case Intrinsic::vp_fmuladd: 2785 return Operand == 0 || Operand == 1; 2786 case Intrinsic::vp_shl: 2787 case Intrinsic::vp_lshr: 2788 case Intrinsic::vp_ashr: 2789 case Intrinsic::vp_udiv: 2790 case Intrinsic::vp_sdiv: 2791 case Intrinsic::vp_urem: 2792 case Intrinsic::vp_srem: 2793 case Intrinsic::ssub_sat: 2794 case Intrinsic::vp_ssub_sat: 2795 case Intrinsic::usub_sat: 2796 case Intrinsic::vp_usub_sat: 2797 case Intrinsic::vp_select: 2798 return Operand == 1; 2799 // These intrinsics are commutative. 
2800 case Intrinsic::vp_add: 2801 case Intrinsic::vp_mul: 2802 case Intrinsic::vp_and: 2803 case Intrinsic::vp_or: 2804 case Intrinsic::vp_xor: 2805 case Intrinsic::vp_fadd: 2806 case Intrinsic::vp_fmul: 2807 case Intrinsic::vp_icmp: 2808 case Intrinsic::vp_fcmp: 2809 case Intrinsic::smin: 2810 case Intrinsic::vp_smin: 2811 case Intrinsic::umin: 2812 case Intrinsic::vp_umin: 2813 case Intrinsic::smax: 2814 case Intrinsic::vp_smax: 2815 case Intrinsic::umax: 2816 case Intrinsic::vp_umax: 2817 case Intrinsic::sadd_sat: 2818 case Intrinsic::vp_sadd_sat: 2819 case Intrinsic::uadd_sat: 2820 case Intrinsic::vp_uadd_sat: 2821 // These intrinsics have 'vr' versions. 2822 case Intrinsic::vp_sub: 2823 case Intrinsic::vp_fsub: 2824 case Intrinsic::vp_fdiv: 2825 return Operand == 0 || Operand == 1; 2826 default: 2827 return false; 2828 } 2829 } 2830 2831 /// Check if sinking \p I's operands to I's basic block is profitable, because 2832 /// the operands can be folded into a target instruction, e.g. 2833 /// splats of scalars can fold into vector instructions. 
2834 bool RISCVTTIImpl::isProfitableToSinkOperands( 2835 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 2836 using namespace llvm::PatternMatch; 2837 2838 if (I->isBitwiseLogicOp()) { 2839 if (!I->getType()->isVectorTy()) { 2840 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) { 2841 for (auto &Op : I->operands()) { 2842 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y) 2843 if (match(Op.get(), m_Not(m_Value()))) { 2844 Ops.push_back(&Op); 2845 return true; 2846 } 2847 } 2848 } 2849 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) { 2850 for (auto &Op : I->operands()) { 2851 // (and X, (not Y)) -> (vandn.vv X, Y) 2852 if (match(Op.get(), m_Not(m_Value()))) { 2853 Ops.push_back(&Op); 2854 return true; 2855 } 2856 // (and X, (splat (not Y))) -> (vandn.vx X, Y) 2857 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()), 2858 m_ZeroInt()), 2859 m_Value(), m_ZeroMask()))) { 2860 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0); 2861 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1); 2862 Ops.push_back(&Not); 2863 Ops.push_back(&InsertElt); 2864 Ops.push_back(&Op); 2865 return true; 2866 } 2867 } 2868 } 2869 } 2870 2871 if (!I->getType()->isVectorTy() || !ST->hasVInstructions()) 2872 return false; 2873 2874 // Don't sink splat operands if the target prefers it. Some targets requires 2875 // S2V transfer buffers and we can run out of them copying the same value 2876 // repeatedly. 2877 // FIXME: It could still be worth doing if it would improve vector register 2878 // pressure and prevent a vector spill. 
2879 if (!ST->sinkSplatOperands()) 2880 return false; 2881 2882 for (auto OpIdx : enumerate(I->operands())) { 2883 if (!canSplatOperand(I, OpIdx.index())) 2884 continue; 2885 2886 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); 2887 // Make sure we are not already sinking this operand 2888 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) 2889 continue; 2890 2891 // We are looking for a splat/vp.splat that can be sunk. 2892 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>( 2893 m_Value(), m_Value(), m_Value())); 2894 if (!IsVPSplat && 2895 !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), 2896 m_Undef(), m_ZeroMask()))) 2897 continue; 2898 2899 // Don't sink i1 splats. 2900 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1)) 2901 continue; 2902 2903 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 2904 // and vector registers 2905 for (Use &U : Op->uses()) { 2906 Instruction *Insn = cast<Instruction>(U.getUser()); 2907 if (!canSplatOperand(Insn, U.getOperandNo())) 2908 return false; 2909 } 2910 2911 // Sink any fpexts since they might be used in a widening fp pattern. 2912 if (IsVPSplat) { 2913 if (isa<FPExtInst>(Op->getOperand(0))) 2914 Ops.push_back(&Op->getOperandUse(0)); 2915 } else { 2916 Use *InsertEltUse = &Op->getOperandUse(0); 2917 auto *InsertElt = cast<InsertElementInst>(InsertEltUse); 2918 if (isa<FPExtInst>(InsertElt->getOperand(1))) 2919 Ops.push_back(&InsertElt->getOperandUse(1)); 2920 Ops.push_back(InsertEltUse); 2921 } 2922 Ops.push_back(&OpIdx.value()); 2923 } 2924 return true; 2925 } 2926 2927 RISCVTTIImpl::TTI::MemCmpExpansionOptions 2928 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 2929 TTI::MemCmpExpansionOptions Options; 2930 // TODO: Enable expansion when unaligned access is not supported after we fix 2931 // issues in ExpandMemcmp. 
2932 if (!ST->enableUnalignedScalarMem()) 2933 return Options; 2934 2935 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp) 2936 return Options; 2937 2938 Options.AllowOverlappingLoads = true; 2939 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 2940 Options.NumLoadsPerBlock = Options.MaxNumLoads; 2941 if (ST->is64Bit()) { 2942 Options.LoadSizes = {8, 4, 2, 1}; 2943 Options.AllowedTailExpansions = {3, 5, 6}; 2944 } else { 2945 Options.LoadSizes = {4, 2, 1}; 2946 Options.AllowedTailExpansions = {3}; 2947 } 2948 2949 if (IsZeroCmp && ST->hasVInstructions()) { 2950 unsigned VLenB = ST->getRealMinVLen() / 8; 2951 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be 2952 // `VLenB * MaxLMUL` so that it fits in a single register group. 2953 unsigned MinSize = ST->getXLen() / 8 + 1; 2954 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors(); 2955 for (unsigned Size = MinSize; Size <= MaxSize; Size++) 2956 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size); 2957 } 2958 return Options; 2959 } 2960