//===- ARMTargetTransformInfo.h - ARM specific TTI --------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file /// This file a TargetTransformInfo::Concept conforming object specific to the /// ARM target machine. It uses the target's detailed information to /// provide more precise answers to certain TTI queries, while letting the /// target independent and default TTI implementations handle the rest. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H #define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H #include "ARM.h" #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Function.h" #include "llvm/MC/SubtargetFeature.h" #include namespace llvm { class APInt; class ARMTargetLowering; class Instruction; class Loop; class SCEV; class ScalarEvolution; class Type; class Value; namespace TailPredication { enum Mode { Disabled = 0, EnabledNoReductions, Enabled, ForceEnabledNoReductions, ForceEnabled }; } // For controlling conversion of memcpy into Tail Predicated loop. namespace TPLoop { enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow }; } class ARMTTIImpl : public BasicTTIImplBase { using BaseT = BasicTTIImplBase; using TTI = TargetTransformInfo; friend BaseT; const ARMSubtarget *ST; const ARMTargetLowering *TLI; // Currently the following features are excluded from InlineFeaturesAllowed. // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32 // Depending on whether they are set or unset, different // instructions/registers are available. For example, inlining a callee with // -thumb-mode in a caller with +thumb-mode, may cause the assembler to // fail if the callee uses ARM only instructions, e.g. in inline asm. const FeatureBitset InlineFeaturesAllowed = { ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2, ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8, ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex, ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc, ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt, ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS, ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing, ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32, ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR, ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits, ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg, ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx, ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs, ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign, ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx, ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates }; const ARMSubtarget *getST() const { return ST; } const ARMTargetLowering *getTLI() const { return TLI; } public: explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} bool areInlineCompatible(const Function *Caller, const Function *Callee) const; bool enableInterleavedAccessVectorization() { return true; } TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const; /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD /// and Arm MVE are IEEE-754 compliant. bool isFPVectorizationPotentiallyUnsafe() { return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } std::optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; std::optional simplifyDemandedVectorEltsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function SimplifyAndSetOp) const; /// \name Scalar TTI Implementations /// @{ InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); using BaseT::getIntImmCost; InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst = nullptr); /// @} /// \name Vector TTI Implementations /// @{ unsigned getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 16; if (ST->hasMVEIntegerOps()) return 8; return 0; } if (ST->isThumb1Only()) return 8; return 13; } TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { switch (K) { case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(32); case TargetTransformInfo::RGK_FixedWidthVector: if (ST->hasNEON()) return TypeSize::getFixed(128); if (ST->hasMVEIntegerOps()) return TypeSize::getFixed(128); return TypeSize::getFixed(0); case TargetTransformInfo::RGK_ScalableVector: return TypeSize::getScalable(0); } llvm_unreachable("Unsupported register kind"); } unsigned getMaxInterleaveFactor(unsigned VF) { return ST->getMaxInterleaveFactor(); } bool isProfitableLSRChainElement(Instruction *I); bool isLegalMaskedLoad(Type *DataTy, Align Alignment); bool isLegalMaskedStore(Type *DataTy, Align Alignment) { return isLegalMaskedLoad(DataTy, Alignment); } bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { // For MVE, we have a custom lowering pass that will already have custom // legalised any gathers that we can lower to MVE intrinsics, and want to // expand all the rest. The pass runs before the masked intrinsic lowering // pass. return true; } bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { return forceScalarizeMaskedGather(VTy, Alignment); } bool isLegalMaskedGather(Type *Ty, Align Alignment); bool isLegalMaskedScatter(Type *Ty, Align Alignment) { return isLegalMaskedGather(Ty, Alignment); } InstructionCost getMemcpyCost(const Instruction *I); int getNumMemOps(const IntrinsicInst *I) const; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = std::nullopt); bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); using BaseT::getVectorInstrCost; InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1); InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr); InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None}, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr); InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false); InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional FMF, TTI::TargetCostKind CostKind); InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional FMF, TTI::TargetCostKind CostKind); InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); /// getScalingFactorCost - Return the cost of the scaling used in /// addressing mode represented by AM. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, the return value must be negative. InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const; bool maybeLoweredToCall(Instruction &I); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo); bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); PredicationStyle emitGetActiveLaneMask() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); bool shouldBuildLookupTablesForConstant(Constant *C) const { // In the ROPI and RWPI relocation models we can't have pointers to global // variables or functions in constant data, so don't convert switches to // lookup tables if any of the values would need relocation. if (ST->isROPI() || ST->isRWPI()) return !C->needsDynamicRelocation(); return true; } /// @} }; /// isVREVMask - Check if a vector shuffle corresponds to a VREV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) inline bool isVREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for VREV are: 16, 32, 64"); unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz != 8 && EltSz != 16 && EltSz != 32) return false; unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0, e = M.size(); i < e; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) return false; } return true; } } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H