//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include <cmath>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(1), cl::Hidden);

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
                                    getST()->getFeatureBits());
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative; in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check that the immediate is in the correct argument position...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getMinSignedBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}
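// A worked example for the simm12 check above (illustrative values, not from
// any particular test): in `add %x, 2047` the constant fits the 12-bit signed
// immediate, so it is reported as TCC_Free and left in place; in `add %x, 2048`
// it does not fit, so it is charged the full materialisation cost and
// ConstantHoisting may choose to hoist it.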
InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV.
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

Optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    return ST->getRealMinVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL = PowerOf2Floor(
      std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
  }

  llvm_unreachable("Unsupported register kind");
}
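// For example (assumed configuration): with a minimum VLEN of 128, RVV enabled
// for fixed-length vectors, and the default -riscv-v-register-bit-width-lmul=1,
// fixed-length vectorization is told registers are 128 bits wide; passing the
// flag with a value of 4 would report 512 bits, i.e. an LMUL=4 register group.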
InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  unsigned Cost = 2; // vslidedown+vslideup.
  // TODO: LMUL should increase cost.
  // TODO: Multiplying by LT.first implies this legalizes into multiple copies
  // of similar code, but I think we expand through memory.
  return Cost * LT.first;
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  if (isa<ScalableVectorType>(Tp)) {
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    switch (Kind) {
    default:
      // Fallthrough to generic handling.
      // TODO: Most of these cases will return getInvalid in generic code, and
      // must be implemented here.
      break;
    case TTI::SK_Broadcast: {
      return LT.first * 1;
    }
    case TTI::SK_Splice:
      return getSpliceCost(Tp, Index);
    case TTI::SK_Reverse:
      // Most of the cost here is producing the vrgather index register.
      // Example sequence:
      //   csrr a0, vlenb
      //   srli a0, a0, 3
      //   addi a0, a0, -1
      //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
      //   vid.v v9
      //   vrsub.vx v10, v9, a0
      //   vrgather.vv v9, v8, v10
      if (Tp->getElementType()->isIntegerTy(1))
        // Mask operations additionally require an extend and a truncate.
        return LT.first * 9;
      return LT.first * 6;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

InstructionCost RISCVTTIImpl::getMaskedMemoryOpCost(
    unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind) {
  if (!isa<ScalableVectorType>(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an upper bound on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind, I);
  unsigned NumLoads = getMaxVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  // TODO: add more intrinsics.
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    return Cost + (LT.first - 1);
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELEN() ||
        Dst->getScalarSizeInBits() > ST->getELEN())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      return 1;
    case ISD::TRUNCATE:
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8
      // i8), so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
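// Worked example for the conversion costs above (assuming both types are legal
// for the subtarget): sitofp from <vscale x 4 x i8> to <vscale x 4 x double>
// has PowDiff == 3, but because the source is an integer vector it is modeled
// as one widening extend plus one vfcvt, i.e. cost 2, rather than a chain of
// three widening conversions.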
unsigned RISCVTTIImpl::getMaxVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBitsMax = ST->getRealMaxVLen();
    return RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough info here, so we slightly overcost.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = getMaxVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getArithmeticReductionCost(
    unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
    TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELEN())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR &&
      ISD != ISD::AND && ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll.
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;
  unsigned VL = getMaxVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}
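// Worked example for the unordered case above (assumed subtarget with a
// maximum VLEN of 512): an add reduction of <vscale x 4 x i32> legalizes to a
// single register group (LT.first == 1) holding at most 32 elements, so the
// cost is 2 + log2(32) = 7. An ordered floating-point reduction instead pays
// the full maximum element count on top of the base cost.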
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  //       would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost +=
          getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force-unrolling small loops can be very useful because of the branch-taken
  // cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = Ty->getPrimitiveSizeInBits();
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}
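// For instance, a <vscale x 8 x i64> value has a known-minimum size of 512
// bits, so the scalable path above reports divideCeil(512, 64) == 8 vector
// register blocks, i.e. an LMUL-8 register group.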