//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

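  // Use whichever alignment is larger: the one encoded in the intrinsic or
  // the one already proven for the pointer by the caller.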
  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
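  // Prefer post-indexed addressing when MVE is available, avoid extra
  // addressing-mode work when optimising for size, and use pre-indexed
  // addressing for simple single-block Thumb2 M-class loops.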
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
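    // If we can prove a larger alignment for the pointer than the one already
    // encoded in the intrinsic, bump up the intrinsic's alignment argument.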
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
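    // i2v(v2i(x) ^ C), where the low 16 bits of C are all ones, is just a
    // lane-wise negation of x, so rewrite it as xor(x, splat(true)).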
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
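    // Only the low 16 bits of the scalar mask operand are significant, so
    // simplify away any demanded bits above them.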
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }

    if (II.getMetadata(LLVMContext::MD_range))
      break;

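    // The result only ever occupies the low 16 bits, so attach that as a
    // return range attribute (intersected with any existing range) for later
    // passes to use.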
    ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));

    if (auto CurrentRange = II.getRange()) {
      Range = Range.intersectWith(*CurrentRange);
      if (Range == CurrentRange)
        break;
    }

    II.addRangeRetAttr(Range);
    II.addRetAttr(Attribute::NoUndef);
    return &II;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

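    // The carry-in is only consumed from bit 29 of the carry operand (the
    // carry flag's position in FPSCR), so only that bit needs to be demanded.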
    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
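    // Fold add(vmldava(..., 0, x, y), z) into vmldava(..., z, x, y): when the
    // intrinsic's accumulator input is zero, a following add can be absorbed
    // into the accumulator operand.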
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
  // the index of the operand that selects between the Top and Bottom
  // variants, which differs between intrinsics.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 will be demanded, depending on
    // whether this is a top or bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
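  // The costs below roughly correspond to the number of instructions needed
  // to materialise the immediate: 1 when the value (or its complement) has a
  // single-instruction encoding, otherwise a movw/movt pair or a
  // constant-pool load.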
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 imm costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
getIntImmCodeSizeCost(unsigned Opcode,unsigned Idx,const APInt & Imm,Type * Ty)336fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
3370b57cec5SDimitry Andric const APInt &Imm, Type *Ty) {
3380b57cec5SDimitry Andric if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
3390b57cec5SDimitry Andric return 0;
3400b57cec5SDimitry Andric
3410b57cec5SDimitry Andric return 1;
3420b57cec5SDimitry Andric }
3430b57cec5SDimitry Andric
344e8d8bef9SDimitry Andric // Checks whether Inst is part of a min(max()) or max(min()) pattern
3454824e7fdSDimitry Andric // that will match to an SSAT instruction. Returns the instruction being
3464824e7fdSDimitry Andric // saturated, or null if no saturation pattern was found.
isSSATMinMaxPattern(Instruction * Inst,const APInt & Imm)3474824e7fdSDimitry Andric static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
348e8d8bef9SDimitry Andric Value *LHS, *RHS;
349e8d8bef9SDimitry Andric ConstantInt *C;
350e8d8bef9SDimitry Andric SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
351e8d8bef9SDimitry Andric
352e8d8bef9SDimitry Andric if (InstSPF == SPF_SMAX &&
353e8d8bef9SDimitry Andric PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
354349cc55cSDimitry Andric C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
355e8d8bef9SDimitry Andric
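    // The SSAT lower bound is -2^n (the Imm matched above); the inner smin's
    // constant must then be the matching upper bound 2^n - 1, i.e. (-Imm) - 1.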
    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
isFPSatMinMaxPattern(Instruction * Inst,const APInt & Imm)3814824e7fdSDimitry Andric static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
3824824e7fdSDimitry Andric if (Imm.getBitWidth() != 64 ||
3834824e7fdSDimitry Andric Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
384e8d8bef9SDimitry Andric return false;
3854824e7fdSDimitry Andric Value *FP = isSSATMinMaxPattern(Inst, Imm);
3864824e7fdSDimitry Andric if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
3874824e7fdSDimitry Andric FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
3884824e7fdSDimitry Andric if (!FP)
3894824e7fdSDimitry Andric return false;
3904824e7fdSDimitry Andric return isa<FPToSIInst>(FP);
391e8d8bef9SDimitry Andric }
392e8d8bef9SDimitry Andric
getIntImmCostInst(unsigned Opcode,unsigned Idx,const APInt & Imm,Type * Ty,TTI::TargetCostKind CostKind,Instruction * Inst)393fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
394e8d8bef9SDimitry Andric const APInt &Imm, Type *Ty,
395e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind,
396e8d8bef9SDimitry Andric Instruction *Inst) {
3970b57cec5SDimitry Andric // Division by a constant can be turned into multiplication, but only if we
3980b57cec5SDimitry Andric // know it's constant. So it's not so much that the immediate is cheap (it's
3990b57cec5SDimitry Andric // not), but that the alternative is worse.
4000b57cec5SDimitry Andric // FIXME: this is probably unneeded with GlobalISel.
4010b57cec5SDimitry Andric if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
4020b57cec5SDimitry Andric Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
4030b57cec5SDimitry Andric Idx == 1)
4040b57cec5SDimitry Andric return 0;
4050b57cec5SDimitry Andric
  // Leave any gep offsets to CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure negative constants of min(max()) or max(min()) patterns that match
  // SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads / truncating masked stores is expensive because we
  // currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so they
        // need to split the load. This introduces an extra load operation,
        // but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not, it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with larger than legal source (128bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
9010b57cec5SDimitry Andric if (ValTy->isVectorTy() &&
9020b57cec5SDimitry Andric ValTy->getScalarSizeInBits() <= 32)
903fe6060f1SDimitry Andric return std::max<InstructionCost>(
904bdd1243dSDimitry Andric BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
905bdd1243dSDimitry Andric 2U);
9060b57cec5SDimitry Andric }
9070b57cec5SDimitry Andric
9088bcb0991SDimitry Andric if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
9098bcb0991SDimitry Andric Opcode == Instruction::ExtractElement)) {
910fe6060f1SDimitry Andric // Integer cross-lane moves are more expensive than float, which can
911fe6060f1SDimitry Andric // sometimes just be vmovs. Integers involve being passed to GPR
912fe6060f1SDimitry Andric // registers, causing more of a delay.
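// For example, extracting an i32 lane is costed at 4, while a float lane
// extract (often just a VMOV) is costed at 1.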
913fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT =
914bdd1243dSDimitry Andric getTypeLegalizationCost(ValTy->getScalarType());
915fe6060f1SDimitry Andric return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
9168bcb0991SDimitry Andric }
9178bcb0991SDimitry Andric
918bdd1243dSDimitry Andric return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
9190b57cec5SDimitry Andric }
9200b57cec5SDimitry Andric
921fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
922fe6060f1SDimitry Andric Type *CondTy,
923e8d8bef9SDimitry Andric CmpInst::Predicate VecPred,
9245ffd83dbSDimitry Andric TTI::TargetCostKind CostKind,
9250b57cec5SDimitry Andric const Instruction *I) {
9260b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
927e8d8bef9SDimitry Andric
928e8d8bef9SDimitry Andric // Thumb scalar code size cost for select.
929e8d8bef9SDimitry Andric if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
930e8d8bef9SDimitry Andric ST->isThumb() && !ValTy->isVectorTy()) {
931e8d8bef9SDimitry Andric // Assume expensive structs.
932e8d8bef9SDimitry Andric if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
933e8d8bef9SDimitry Andric return TTI::TCC_Expensive;
934e8d8bef9SDimitry Andric
935e8d8bef9SDimitry Andric // Select costs can vary because they:
936e8d8bef9SDimitry Andric // - may require one or more conditional mov (including an IT),
937e8d8bef9SDimitry Andric // - can't operate directly on immediates,
938e8d8bef9SDimitry Andric // - require live flags, which we can't copy around easily.
939bdd1243dSDimitry Andric InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
940e8d8bef9SDimitry Andric
941e8d8bef9SDimitry Andric // Possible IT instruction for Thumb2, or more for Thumb1.
942e8d8bef9SDimitry Andric ++Cost;
943e8d8bef9SDimitry Andric
944e8d8bef9SDimitry Andric // i1 values may need rematerialising by using mov immediates and/or
945e8d8bef9SDimitry Andric // flag setting instructions.
946e8d8bef9SDimitry Andric if (ValTy->isIntegerTy(1))
947e8d8bef9SDimitry Andric ++Cost;
948e8d8bef9SDimitry Andric
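// For example, a scalar i1 select ends up costed as 3 here: one for the
// legalized value, one for the IT, and one for rematerialising the i1.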
949e8d8bef9SDimitry Andric return Cost;
950e8d8bef9SDimitry Andric }
951e8d8bef9SDimitry Andric
952fe6060f1SDimitry Andric // If this is a vector min/max/abs, use the cost of that intrinsic directly
953fe6060f1SDimitry Andric // instead. Hopefully when min/max intrinsics are more prevalent this code
954fe6060f1SDimitry Andric // will not be needed.
955fe6060f1SDimitry Andric const Instruction *Sel = I;
956fe6060f1SDimitry Andric if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
957fe6060f1SDimitry Andric Sel->hasOneUse())
958fe6060f1SDimitry Andric Sel = cast<Instruction>(Sel->user_back());
959fe6060f1SDimitry Andric if (Sel && ValTy->isVectorTy() &&
960fe6060f1SDimitry Andric (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
961fe6060f1SDimitry Andric const Value *LHS, *RHS;
962fe6060f1SDimitry Andric SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
963fe6060f1SDimitry Andric unsigned IID = 0;
964fe6060f1SDimitry Andric switch (SPF) {
965fe6060f1SDimitry Andric case SPF_ABS:
966fe6060f1SDimitry Andric IID = Intrinsic::abs;
967fe6060f1SDimitry Andric break;
968fe6060f1SDimitry Andric case SPF_SMIN:
969fe6060f1SDimitry Andric IID = Intrinsic::smin;
970fe6060f1SDimitry Andric break;
971fe6060f1SDimitry Andric case SPF_SMAX:
972fe6060f1SDimitry Andric IID = Intrinsic::smax;
973fe6060f1SDimitry Andric break;
974fe6060f1SDimitry Andric case SPF_UMIN:
975fe6060f1SDimitry Andric IID = Intrinsic::umin;
976fe6060f1SDimitry Andric break;
977fe6060f1SDimitry Andric case SPF_UMAX:
978fe6060f1SDimitry Andric IID = Intrinsic::umax;
979fe6060f1SDimitry Andric break;
980fe6060f1SDimitry Andric case SPF_FMINNUM:
981fe6060f1SDimitry Andric IID = Intrinsic::minnum;
982fe6060f1SDimitry Andric break;
983fe6060f1SDimitry Andric case SPF_FMAXNUM:
984fe6060f1SDimitry Andric IID = Intrinsic::maxnum;
985fe6060f1SDimitry Andric break;
986fe6060f1SDimitry Andric default:
987fe6060f1SDimitry Andric break;
988fe6060f1SDimitry Andric }
989fe6060f1SDimitry Andric if (IID) {
990fe6060f1SDimitry Andric // The ICmp is free, the select gets the cost of the min/max/etc
991fe6060f1SDimitry Andric if (Sel != I)
992fe6060f1SDimitry Andric return 0;
993fe6060f1SDimitry Andric IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
994fe6060f1SDimitry Andric return getIntrinsicInstrCost(CostAttrs, CostKind);
995fe6060f1SDimitry Andric }
996fe6060f1SDimitry Andric }
997fe6060f1SDimitry Andric
9980b57cec5SDimitry Andric // On NEON a vector select gets lowered to vbsl.
999e8d8bef9SDimitry Andric if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
10000b57cec5SDimitry Andric // Lowering of some vector selects is currently far from perfect.
10010b57cec5SDimitry Andric static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
10020b57cec5SDimitry Andric { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
10030b57cec5SDimitry Andric { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
10040b57cec5SDimitry Andric { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
10050b57cec5SDimitry Andric };
10060b57cec5SDimitry Andric
10070b57cec5SDimitry Andric EVT SelCondTy = TLI->getValueType(DL, CondTy);
10080b57cec5SDimitry Andric EVT SelValTy = TLI->getValueType(DL, ValTy);
10090b57cec5SDimitry Andric if (SelCondTy.isSimple() && SelValTy.isSimple()) {
10100b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
10110b57cec5SDimitry Andric SelCondTy.getSimpleVT(),
10120b57cec5SDimitry Andric SelValTy.getSimpleVT()))
10130b57cec5SDimitry Andric return Entry->Cost;
10140b57cec5SDimitry Andric }
10150b57cec5SDimitry Andric
1016bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
10170b57cec5SDimitry Andric return LT.first;
10180b57cec5SDimitry Andric }
10190b57cec5SDimitry Andric
1020fe6060f1SDimitry Andric if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1021fe6060f1SDimitry Andric (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1022fe6060f1SDimitry Andric cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1023fe6060f1SDimitry Andric FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1024fe6060f1SDimitry Andric FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1025fe6060f1SDimitry Andric if (!VecCondTy)
1026fe6060f1SDimitry Andric VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1027fe6060f1SDimitry Andric
1028fe6060f1SDimitry Andric // If we don't have mve.fp any fp operations will need to be scalarized.
1029fe6060f1SDimitry Andric if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1030fe6060f1SDimitry Andric // One scalarization insert, one scalarization extract and the cost of the
1031fe6060f1SDimitry Andric // fcmps.
1032bdd1243dSDimitry Andric return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1033bdd1243dSDimitry Andric /*Extract*/ true, CostKind) +
1034bdd1243dSDimitry Andric BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1035bdd1243dSDimitry Andric /*Extract*/ false, CostKind) +
1036fe6060f1SDimitry Andric VecValTy->getNumElements() *
1037fe6060f1SDimitry Andric getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1038bdd1243dSDimitry Andric VecCondTy->getScalarType(), VecPred,
1039bdd1243dSDimitry Andric CostKind, I);
1040fe6060f1SDimitry Andric }
1041fe6060f1SDimitry Andric
1042bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1043fe6060f1SDimitry Andric int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1044fe6060f1SDimitry Andric // There are two types - the input that specifies the type of the compare
1045fe6060f1SDimitry Andric // and the output vXi1 type. Because we don't know how the output will be
1046fe6060f1SDimitry Andric // split, we may need an expensive shuffle to get the two in sync. This has the
1047fe6060f1SDimitry Andric // effect of making larger than legal compares (v8i32 for example)
1048fe6060f1SDimitry Andric // expensive.
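// For example, a v8i32 compare legalizes to two v4i32 compares, and we add
// the overhead of assembling the split v8i1 predicate.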
1049f3fd488fSDimitry Andric if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1050fe6060f1SDimitry Andric if (LT.first > 1)
1051fe6060f1SDimitry Andric return LT.first * BaseCost +
1052bdd1243dSDimitry Andric BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1053bdd1243dSDimitry Andric /*Extract*/ false, CostKind);
1054fe6060f1SDimitry Andric return BaseCost;
1055fe6060f1SDimitry Andric }
1056fe6060f1SDimitry Andric }
1057fe6060f1SDimitry Andric
1058e8d8bef9SDimitry Andric // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1059e8d8bef9SDimitry Andric // for "multiple beats" potentially needed by MVE instructions.
1060e8d8bef9SDimitry Andric int BaseCost = 1;
1061fe6060f1SDimitry Andric if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1062fe6060f1SDimitry Andric BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063e8d8bef9SDimitry Andric
1064e8d8bef9SDimitry Andric return BaseCost *
1065e8d8bef9SDimitry Andric BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
10660b57cec5SDimitry Andric }
10670b57cec5SDimitry Andric
1068fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1069fe6060f1SDimitry Andric ScalarEvolution *SE,
10700b57cec5SDimitry Andric const SCEV *Ptr) {
10710b57cec5SDimitry Andric // Address computations in vectorized code with non-consecutive addresses will
10720b57cec5SDimitry Andric // likely result in more instructions compared to scalar code where the
10730b57cec5SDimitry Andric // computation can more often be merged into the index mode. The resulting
10740b57cec5SDimitry Andric // extra micro-ops can significantly decrease throughput.
10750b57cec5SDimitry Andric unsigned NumVectorInstToHideOverhead = 10;
10760b57cec5SDimitry Andric int MaxMergeDistance = 64;
10770b57cec5SDimitry Andric
10788bcb0991SDimitry Andric if (ST->hasNEON()) {
10790b57cec5SDimitry Andric if (Ty->isVectorTy() && SE &&
10800b57cec5SDimitry Andric !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
10810b57cec5SDimitry Andric return NumVectorInstToHideOverhead;
10820b57cec5SDimitry Andric
10830b57cec5SDimitry Andric // In many cases the address computation is not merged into the instruction
10840b57cec5SDimitry Andric // addressing mode.
10850b57cec5SDimitry Andric return 1;
10860b57cec5SDimitry Andric }
10878bcb0991SDimitry Andric return BaseT::getAddressComputationCost(Ty, SE, Ptr);
10888bcb0991SDimitry Andric }
10898bcb0991SDimitry Andric
10905ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
10915ffd83dbSDimitry Andric if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
10925ffd83dbSDimitry Andric // If a VCTP is part of a chain, it's already profitable and shouldn't be
10935ffd83dbSDimitry Andric // optimized, else LSR may block tail-predication.
10945ffd83dbSDimitry Andric switch (II->getIntrinsicID()) {
10955ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp8:
10965ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp16:
10975ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp32:
10985ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp64:
10995ffd83dbSDimitry Andric return true;
11005ffd83dbSDimitry Andric default:
11015ffd83dbSDimitry Andric break;
11025ffd83dbSDimitry Andric }
11035ffd83dbSDimitry Andric }
11045ffd83dbSDimitry Andric return false;
11055ffd83dbSDimitry Andric }
11065ffd83dbSDimitry Andric
11075ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
11088bcb0991SDimitry Andric if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
11098bcb0991SDimitry Andric return false;
11108bcb0991SDimitry Andric
11115ffd83dbSDimitry Andric if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
11128bcb0991SDimitry Andric // Don't support v2i1 yet.
11138bcb0991SDimitry Andric if (VecTy->getNumElements() == 2)
11148bcb0991SDimitry Andric return false;
11158bcb0991SDimitry Andric
11168bcb0991SDimitry Andric // We don't support extending fp types.
11178bcb0991SDimitry Andric unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
11188bcb0991SDimitry Andric if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
11198bcb0991SDimitry Andric return false;
11208bcb0991SDimitry Andric }
11218bcb0991SDimitry Andric
11228bcb0991SDimitry Andric unsigned EltWidth = DataTy->getScalarSizeInBits();
11235ffd83dbSDimitry Andric return (EltWidth == 32 && Alignment >= 4) ||
11245ffd83dbSDimitry Andric (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
11258bcb0991SDimitry Andric }
11260b57cec5SDimitry Andric
11275ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1128480093f4SDimitry Andric if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1129480093f4SDimitry Andric return false;
1130480093f4SDimitry Andric
1131480093f4SDimitry Andric unsigned EltWidth = Ty->getScalarSizeInBits();
11325ffd83dbSDimitry Andric return ((EltWidth == 32 && Alignment >= 4) ||
11335ffd83dbSDimitry Andric (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1134480093f4SDimitry Andric }
1135480093f4SDimitry Andric
1136e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory
1137e8d8bef9SDimitry Andric /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1138e8d8bef9SDimitry Andric /// call is used.
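/// For a constant-length memcpy/memmove each lowered memory type counts as a
/// load plus a store (Factor of 2); for memset each counts as one store.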
1139e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1140e8d8bef9SDimitry Andric MemOp MOp;
1141e8d8bef9SDimitry Andric unsigned DstAddrSpace = ~0u;
1142e8d8bef9SDimitry Andric unsigned SrcAddrSpace = ~0u;
1143e8d8bef9SDimitry Andric const Function *F = I->getParent()->getParent();
11440b57cec5SDimitry Andric
1145e8d8bef9SDimitry Andric if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1146e8d8bef9SDimitry Andric ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
11470b57cec5SDimitry Andric // If 'size' is not a constant, a library call will be generated.
11480b57cec5SDimitry Andric if (!C)
1149e8d8bef9SDimitry Andric return -1;
11500b57cec5SDimitry Andric
11510b57cec5SDimitry Andric const unsigned Size = C->getValue().getZExtValue();
1152e8d8bef9SDimitry Andric const Align DstAlign = *MC->getDestAlign();
1153e8d8bef9SDimitry Andric const Align SrcAlign = *MC->getSourceAlign();
1154e8d8bef9SDimitry Andric
1155e8d8bef9SDimitry Andric MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1156e8d8bef9SDimitry Andric /*IsVolatile*/ false);
1157e8d8bef9SDimitry Andric DstAddrSpace = MC->getDestAddressSpace();
1158e8d8bef9SDimitry Andric SrcAddrSpace = MC->getSourceAddressSpace();
1159e8d8bef9SDimitry Andric }
1160e8d8bef9SDimitry Andric else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1161e8d8bef9SDimitry Andric ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1162e8d8bef9SDimitry Andric // If 'size' is not a constant, a library call will be generated.
1163e8d8bef9SDimitry Andric if (!C)
1164e8d8bef9SDimitry Andric return -1;
1165e8d8bef9SDimitry Andric
1166e8d8bef9SDimitry Andric const unsigned Size = C->getValue().getZExtValue();
1167e8d8bef9SDimitry Andric const Align DstAlign = *MS->getDestAlign();
1168e8d8bef9SDimitry Andric
1169e8d8bef9SDimitry Andric MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1170e8d8bef9SDimitry Andric /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1171e8d8bef9SDimitry Andric DstAddrSpace = MS->getDestAddressSpace();
1172e8d8bef9SDimitry Andric }
1173e8d8bef9SDimitry Andric else
1174e8d8bef9SDimitry Andric llvm_unreachable("Expected a memcpy/move or memset!");
1175e8d8bef9SDimitry Andric
1176e8d8bef9SDimitry Andric unsigned Limit, Factor = 2;
1177e8d8bef9SDimitry Andric switch(I->getIntrinsicID()) {
1178e8d8bef9SDimitry Andric case Intrinsic::memcpy:
1179e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1180e8d8bef9SDimitry Andric break;
1181e8d8bef9SDimitry Andric case Intrinsic::memmove:
1182e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1183e8d8bef9SDimitry Andric break;
1184e8d8bef9SDimitry Andric case Intrinsic::memset:
1185e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1186e8d8bef9SDimitry Andric Factor = 1;
1187e8d8bef9SDimitry Andric break;
1188e8d8bef9SDimitry Andric default:
1189e8d8bef9SDimitry Andric llvm_unreachable("Expected a memcpy/move or memset!");
1190e8d8bef9SDimitry Andric }
11910b57cec5SDimitry Andric
11920b57cec5SDimitry Andric // MemOps will be populated with a list of data types that need to be
11930b57cec5SDimitry Andric // loaded and stored. That's why we multiply the number of elements by 2 to
11940b57cec5SDimitry Andric // get the cost for this memcpy.
1195e8d8bef9SDimitry Andric std::vector<EVT> MemOps;
11960b57cec5SDimitry Andric if (getTLI()->findOptimalMemOpLowering(
1197e8d8bef9SDimitry Andric MemOps, Limit, MOp, DstAddrSpace,
1198e8d8bef9SDimitry Andric SrcAddrSpace, F->getAttributes()))
1199e8d8bef9SDimitry Andric return MemOps.size() * Factor;
12000b57cec5SDimitry Andric
12010b57cec5SDimitry Andric // If we can't find an optimal memop lowering, return the default cost
1202e8d8bef9SDimitry Andric return -1;
1203e8d8bef9SDimitry Andric }
1204e8d8bef9SDimitry Andric
1205fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1206e8d8bef9SDimitry Andric int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1207e8d8bef9SDimitry Andric
1208e8d8bef9SDimitry Andric // To model the cost of a library call, we assume 1 for the call, and
1209e8d8bef9SDimitry Andric // 3 for the argument setup.
1210e8d8bef9SDimitry Andric if (NumOps == -1)
1211e8d8bef9SDimitry Andric return 4;
1212e8d8bef9SDimitry Andric return NumOps;
12130b57cec5SDimitry Andric }
12140b57cec5SDimitry Andric
1215fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1216fe6060f1SDimitry Andric VectorType *Tp, ArrayRef<int> Mask,
1217bdd1243dSDimitry Andric TTI::TargetCostKind CostKind,
121881ad6265SDimitry Andric int Index, VectorType *SubTp,
1219*0fca6ea1SDimitry Andric ArrayRef<const Value *> Args,
1220*0fca6ea1SDimitry Andric const Instruction *CxtI) {
12215f757f3fSDimitry Andric Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1222*0fca6ea1SDimitry Andric // Treat extractsubvector as single op permutation.
1223*0fca6ea1SDimitry Andric bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1224*0fca6ea1SDimitry Andric if (IsExtractSubvector)
1225*0fca6ea1SDimitry Andric Kind = TTI::SK_PermuteSingleSrc;
12268bcb0991SDimitry Andric if (ST->hasNEON()) {
12270b57cec5SDimitry Andric if (Kind == TTI::SK_Broadcast) {
12280b57cec5SDimitry Andric static const CostTblEntry NEONDupTbl[] = {
12290b57cec5SDimitry Andric // VDUP handles these cases.
12300b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12310b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12320b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12330b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12340b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12350b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12360b57cec5SDimitry Andric
12370b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12380b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12390b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12400b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
12410b57cec5SDimitry Andric
1242bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12438bcb0991SDimitry Andric if (const auto *Entry =
12448bcb0991SDimitry Andric CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
12450b57cec5SDimitry Andric return LT.first * Entry->Cost;
12460b57cec5SDimitry Andric }
12470b57cec5SDimitry Andric if (Kind == TTI::SK_Reverse) {
12480b57cec5SDimitry Andric static const CostTblEntry NEONShuffleTbl[] = {
12490b57cec5SDimitry Andric // Reverse shuffle costs one instruction if we are shuffling within a
12500b57cec5SDimitry Andric // double word (vrev) or two if we shuffle a quad word (vrev, vext).
12510b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12520b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12530b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12540b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12550b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12560b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12570b57cec5SDimitry Andric
12580b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12590b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12600b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
12610b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
12620b57cec5SDimitry Andric
1263bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12648bcb0991SDimitry Andric if (const auto *Entry =
12658bcb0991SDimitry Andric CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
12660b57cec5SDimitry Andric return LT.first * Entry->Cost;
12670b57cec5SDimitry Andric }
12680b57cec5SDimitry Andric if (Kind == TTI::SK_Select) {
12690b57cec5SDimitry Andric static const CostTblEntry NEONSelShuffleTbl[] = {
12708bcb0991SDimitry Andric // Select shuffle cost table for ARM. Cost is the number of
12718bcb0991SDimitry Andric // instructions required to create the shuffled vector.
12730b57cec5SDimitry Andric
12740b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12750b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12760b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12770b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12780b57cec5SDimitry Andric
12790b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12800b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12810b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
12820b57cec5SDimitry Andric
12830b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
12840b57cec5SDimitry Andric
12850b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
12860b57cec5SDimitry Andric
1287bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12880b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
12890b57cec5SDimitry Andric ISD::VECTOR_SHUFFLE, LT.second))
12900b57cec5SDimitry Andric return LT.first * Entry->Cost;
12910b57cec5SDimitry Andric }
12928bcb0991SDimitry Andric }
12938bcb0991SDimitry Andric if (ST->hasMVEIntegerOps()) {
12948bcb0991SDimitry Andric if (Kind == TTI::SK_Broadcast) {
12958bcb0991SDimitry Andric static const CostTblEntry MVEDupTbl[] = {
12968bcb0991SDimitry Andric // VDUP handles these cases.
12978bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12988bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12998bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
13008bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
13018bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
13028bcb0991SDimitry Andric
1303bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
13048bcb0991SDimitry Andric if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
13058bcb0991SDimitry Andric LT.second))
1306fe6060f1SDimitry Andric return LT.first * Entry->Cost *
1307fe6060f1SDimitry Andric ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
13080b57cec5SDimitry Andric }
13090b57cec5SDimitry Andric
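// A mask that reverses lanes within 16/32/64-bit chunks corresponds to a
// single VREV16/VREV32/VREV64 per legalized vector.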
1310fe6060f1SDimitry Andric if (!Mask.empty()) {
1311bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
131256f451bbSDimitry Andric if (LT.second.isVector() &&
131356f451bbSDimitry Andric Mask.size() <= LT.second.getVectorNumElements() &&
1314fe6060f1SDimitry Andric (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1315fe6060f1SDimitry Andric isVREVMask(Mask, LT.second, 64)))
1316fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1317fe6060f1SDimitry Andric }
1318fe6060f1SDimitry Andric }
1319fe6060f1SDimitry Andric
1320*0fca6ea1SDimitry Andric // Restore optimal kind.
1321*0fca6ea1SDimitry Andric if (IsExtractSubvector)
1322*0fca6ea1SDimitry Andric Kind = TTI::SK_ExtractSubvector;
1323fe6060f1SDimitry Andric int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1324fe6060f1SDimitry Andric ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1325fe6060f1SDimitry Andric : 1;
1326bdd1243dSDimitry Andric return BaseCost *
1327bdd1243dSDimitry Andric BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1328fe6060f1SDimitry Andric }
1329fe6060f1SDimitry Andric
1330fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1331fe6060f1SDimitry Andric unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1332bdd1243dSDimitry Andric TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1333bdd1243dSDimitry Andric ArrayRef<const Value *> Args,
1334480093f4SDimitry Andric const Instruction *CxtI) {
13350b57cec5SDimitry Andric int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1336e8d8bef9SDimitry Andric if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1337e8d8bef9SDimitry Andric // Make operations on i1 relatively expensive as this often involves
1338e8d8bef9SDimitry Andric // combining predicates. AND and XOR should be easier to handle with IT
1339e8d8bef9SDimitry Andric // blocks.
1340e8d8bef9SDimitry Andric switch (ISDOpcode) {
1341e8d8bef9SDimitry Andric default:
1342e8d8bef9SDimitry Andric break;
1343e8d8bef9SDimitry Andric case ISD::AND:
1344e8d8bef9SDimitry Andric case ISD::XOR:
1345e8d8bef9SDimitry Andric return 2;
1346e8d8bef9SDimitry Andric case ISD::OR:
1347e8d8bef9SDimitry Andric return 3;
1348e8d8bef9SDimitry Andric }
1349e8d8bef9SDimitry Andric }
1350e8d8bef9SDimitry Andric
1351bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
13520b57cec5SDimitry Andric
1353480093f4SDimitry Andric if (ST->hasNEON()) {
13540b57cec5SDimitry Andric const unsigned FunctionCallDivCost = 20;
13550b57cec5SDimitry Andric const unsigned ReciprocalDivCost = 10;
13560b57cec5SDimitry Andric static const CostTblEntry CostTbl[] = {
13570b57cec5SDimitry Andric // Division.
13580b57cec5SDimitry Andric // These costs are somewhat random. Choose a cost of 20 to indicate that
13590b57cec5SDimitry Andric // vectorizing division (added function call) is going to be very expensive.
13600b57cec5SDimitry Andric // Double registers types.
13610b57cec5SDimitry Andric { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13620b57cec5SDimitry Andric { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13630b57cec5SDimitry Andric { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
13640b57cec5SDimitry Andric { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
13650b57cec5SDimitry Andric { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13660b57cec5SDimitry Andric { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13670b57cec5SDimitry Andric { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
13680b57cec5SDimitry Andric { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
13690b57cec5SDimitry Andric { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
13700b57cec5SDimitry Andric { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
13710b57cec5SDimitry Andric { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
13720b57cec5SDimitry Andric { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
13730b57cec5SDimitry Andric { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
13740b57cec5SDimitry Andric { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
13750b57cec5SDimitry Andric { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
13760b57cec5SDimitry Andric { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
13770b57cec5SDimitry Andric // Quad register types.
13780b57cec5SDimitry Andric { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13790b57cec5SDimitry Andric { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13800b57cec5SDimitry Andric { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
13810b57cec5SDimitry Andric { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
13820b57cec5SDimitry Andric { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13830b57cec5SDimitry Andric { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13840b57cec5SDimitry Andric { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
13850b57cec5SDimitry Andric { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
13860b57cec5SDimitry Andric { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13870b57cec5SDimitry Andric { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13880b57cec5SDimitry Andric { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
13890b57cec5SDimitry Andric { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
13900b57cec5SDimitry Andric { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13910b57cec5SDimitry Andric { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13920b57cec5SDimitry Andric { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
13930b57cec5SDimitry Andric { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
13940b57cec5SDimitry Andric // Multiplication.
13950b57cec5SDimitry Andric };
13960b57cec5SDimitry Andric
13970b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
13980b57cec5SDimitry Andric return LT.first * Entry->Cost;
13990b57cec5SDimitry Andric
1400fe6060f1SDimitry Andric InstructionCost Cost = BaseT::getArithmeticInstrCost(
1401bdd1243dSDimitry Andric Opcode, Ty, CostKind, Op1Info, Op2Info);
14020b57cec5SDimitry Andric
14030b57cec5SDimitry Andric // This is somewhat of a hack. The problem that we are facing is that SROA
14040b57cec5SDimitry Andric // creates a sequence of shift, and, or instructions to construct values.
14050b57cec5SDimitry Andric // These sequences are recognized by the ISel and have zero-cost. Not so for
14060b57cec5SDimitry Andric // the vectorized code. Because we have support for v2i64 but not i64 those
14070b57cec5SDimitry Andric // sequences look particularly beneficial to vectorize.
14080b57cec5SDimitry Andric // To work around this we increase the cost of v2i64 operations to make them
14090b57cec5SDimitry Andric // seem less beneficial.
1410bdd1243dSDimitry Andric if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
14110b57cec5SDimitry Andric Cost += 4;
14120b57cec5SDimitry Andric
14130b57cec5SDimitry Andric return Cost;
14140b57cec5SDimitry Andric }
14150b57cec5SDimitry Andric
1416480093f4SDimitry Andric // If this operation is a shift on arm/thumb2, it might well be folded into
1417480093f4SDimitry Andric // the following instruction, hence having a cost of 0.
1418480093f4SDimitry Andric auto LooksLikeAFreeShift = [&]() {
1419480093f4SDimitry Andric if (ST->isThumb1Only() || Ty->isVectorTy())
1420480093f4SDimitry Andric return false;
1421480093f4SDimitry Andric
1422480093f4SDimitry Andric if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1423480093f4SDimitry Andric return false;
1424bdd1243dSDimitry Andric if (!Op2Info.isUniform() || !Op2Info.isConstant())
1425480093f4SDimitry Andric return false;
1426480093f4SDimitry Andric
1427480093f4SDimitry Andric // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
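// e.g. "add r0, r1, r2, lsl #2" folds the shift into the add's second operand.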
1428480093f4SDimitry Andric switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1429480093f4SDimitry Andric case Instruction::Add:
1430480093f4SDimitry Andric case Instruction::Sub:
1431480093f4SDimitry Andric case Instruction::And:
1432480093f4SDimitry Andric case Instruction::Xor:
1433480093f4SDimitry Andric case Instruction::Or:
1434480093f4SDimitry Andric case Instruction::ICmp:
1435480093f4SDimitry Andric return true;
1436480093f4SDimitry Andric default:
1437480093f4SDimitry Andric return false;
1438480093f4SDimitry Andric }
1439480093f4SDimitry Andric };
1440480093f4SDimitry Andric if (LooksLikeAFreeShift())
1441480093f4SDimitry Andric return 0;
1442480093f4SDimitry Andric
1443e8d8bef9SDimitry Andric // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1444e8d8bef9SDimitry Andric // for "multiple beats" potentially needed by MVE instructions.
1445e8d8bef9SDimitry Andric int BaseCost = 1;
1446fe6060f1SDimitry Andric if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1447fe6060f1SDimitry Andric BaseCost = ST->getMVEVectorCostFactor(CostKind);
14488bcb0991SDimitry Andric
14498bcb0991SDimitry Andric // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
14508bcb0991SDimitry Andric // without treating floats as more expensive than scalars or increasing the
14518bcb0991SDimitry Andric // costs for custom operations. The result is also multiplied by the
14528bcb0991SDimitry Andric // MVEVectorCostFactor where appropriate.
14538bcb0991SDimitry Andric if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
14548bcb0991SDimitry Andric return LT.first * BaseCost;
14558bcb0991SDimitry Andric
14568bcb0991SDimitry Andric // Else this is expand, assume that we need to scalarize this op.
14575ffd83dbSDimitry Andric if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
14585ffd83dbSDimitry Andric unsigned Num = VTy->getNumElements();
1459fe6060f1SDimitry Andric InstructionCost Cost =
1460fe6060f1SDimitry Andric getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
14618bcb0991SDimitry Andric // Return the cost of multiple scalar invocation plus the cost of
14628bcb0991SDimitry Andric // inserting and extracting the values.
1463fe6060f1SDimitry Andric SmallVector<Type *> Tys(Args.size(), Ty);
1464bdd1243dSDimitry Andric return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1465bdd1243dSDimitry Andric Num * Cost;
14668bcb0991SDimitry Andric }
14678bcb0991SDimitry Andric
14688bcb0991SDimitry Andric return BaseCost;
14698bcb0991SDimitry Andric }
14708bcb0991SDimitry Andric
1471fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1472fe6060f1SDimitry Andric MaybeAlign Alignment,
1473fe6060f1SDimitry Andric unsigned AddressSpace,
14745ffd83dbSDimitry Andric TTI::TargetCostKind CostKind,
1475bdd1243dSDimitry Andric TTI::OperandValueInfo OpInfo,
1476480093f4SDimitry Andric const Instruction *I) {
14775ffd83dbSDimitry Andric // TODO: Handle other cost kinds.
14785ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput)
14795ffd83dbSDimitry Andric return 1;
14805ffd83dbSDimitry Andric
14815ffd83dbSDimitry Andric // Type legalization can't handle structs
14825ffd83dbSDimitry Andric if (TLI->getValueType(DL, Src, true) == MVT::Other)
14835ffd83dbSDimitry Andric return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14845ffd83dbSDimitry Andric CostKind);
14850b57cec5SDimitry Andric
1486480093f4SDimitry Andric if (ST->hasNEON() && Src->isVectorTy() &&
1487480093f4SDimitry Andric (Alignment && *Alignment != Align(16)) &&
14885ffd83dbSDimitry Andric cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
14890b57cec5SDimitry Andric // Unaligned loads/stores are extremely inefficient.
14900b57cec5SDimitry Andric // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1491bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
14920b57cec5SDimitry Andric return LT.first * 4;
14930b57cec5SDimitry Andric }
14945ffd83dbSDimitry Andric
14955ffd83dbSDimitry Andric // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
14965ffd83dbSDimitry Andric // Same for stores.
14975ffd83dbSDimitry Andric if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
14985ffd83dbSDimitry Andric ((Opcode == Instruction::Load && I->hasOneUse() &&
14995ffd83dbSDimitry Andric isa<FPExtInst>(*I->user_begin())) ||
15005ffd83dbSDimitry Andric (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
15015ffd83dbSDimitry Andric FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
15025ffd83dbSDimitry Andric Type *DstTy =
15035ffd83dbSDimitry Andric Opcode == Instruction::Load
15045ffd83dbSDimitry Andric ? (*I->user_begin())->getType()
15055ffd83dbSDimitry Andric : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
15065ffd83dbSDimitry Andric if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
15075ffd83dbSDimitry Andric DstTy->getScalarType()->isFloatTy())
1508fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind);
15095ffd83dbSDimitry Andric }
15105ffd83dbSDimitry Andric
15118bcb0991SDimitry Andric int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1512fe6060f1SDimitry Andric ? ST->getMVEVectorCostFactor(CostKind)
15138bcb0991SDimitry Andric : 1;
15145ffd83dbSDimitry Andric return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1515bdd1243dSDimitry Andric CostKind, OpInfo, I);
15160b57cec5SDimitry Andric }
15170b57cec5SDimitry Andric
1518fe6060f1SDimitry Andric InstructionCost
1519fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1520e8d8bef9SDimitry Andric unsigned AddressSpace,
1521e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) {
1522e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps()) {
1523e8d8bef9SDimitry Andric if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1524fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind);
1525e8d8bef9SDimitry Andric if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1526fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind);
1527e8d8bef9SDimitry Andric }
1528e8d8bef9SDimitry Andric if (!isa<FixedVectorType>(Src))
1529e8d8bef9SDimitry Andric return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1530e8d8bef9SDimitry Andric CostKind);
1531e8d8bef9SDimitry Andric // Scalar cost, which is currently very high due to the inefficiency of the
1532e8d8bef9SDimitry Andric // generated code.
1533e8d8bef9SDimitry Andric return cast<FixedVectorType>(Src)->getNumElements() * 8;
1534e8d8bef9SDimitry Andric }
1535e8d8bef9SDimitry Andric
1536fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1537480093f4SDimitry Andric unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
15385ffd83dbSDimitry Andric Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
15395ffd83dbSDimitry Andric bool UseMaskForCond, bool UseMaskForGaps) {
15400b57cec5SDimitry Andric assert(Factor >= 2 && "Invalid interleave factor");
15410b57cec5SDimitry Andric assert(isa<VectorType>(VecTy) && "Expect a vector type");
15420b57cec5SDimitry Andric
15430b57cec5SDimitry Andric // vldN/vstN don't support vector types with i64/f64 elements.
15440b57cec5SDimitry Andric bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
15450b57cec5SDimitry Andric
15460b57cec5SDimitry Andric if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
15470b57cec5SDimitry Andric !UseMaskForCond && !UseMaskForGaps) {
15485ffd83dbSDimitry Andric unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
15495ffd83dbSDimitry Andric auto *SubVecTy =
15505ffd83dbSDimitry Andric FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
15510b57cec5SDimitry Andric
15520b57cec5SDimitry Andric // vldN/vstN only support legal vector types of size 64 or 128 in bits.
15530b57cec5SDimitry Andric // Accesses having vector types that are a multiple of 128 bits can be
15540b57cec5SDimitry Andric // matched to more than one vldN/vstN instruction.
1555fe6060f1SDimitry Andric int BaseCost =
1556fe6060f1SDimitry Andric ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
15570b57cec5SDimitry Andric if (NumElts % Factor == 0 &&
1558fe6060f1SDimitry Andric TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1559480093f4SDimitry Andric return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1560480093f4SDimitry Andric
1561480093f4SDimitry Andric // Some smaller than legal interleaved patterns are cheap as we can make
1562480093f4SDimitry Andric // use of the vmovn or vrev patterns to interleave a standard load. This is
1563480093f4SDimitry Andric // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1564480093f4SDimitry Andric // promoted differently). The cost of 2 here is then a load and vrev or
1565480093f4SDimitry Andric // vmovn.
1566480093f4SDimitry Andric if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1567e8d8bef9SDimitry Andric VecTy->isIntOrIntVectorTy() &&
1568bdd1243dSDimitry Andric DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1569480093f4SDimitry Andric return 2 * BaseCost;
15700b57cec5SDimitry Andric }
15710b57cec5SDimitry Andric
15720b57cec5SDimitry Andric return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
15735ffd83dbSDimitry Andric Alignment, AddressSpace, CostKind,
15740b57cec5SDimitry Andric UseMaskForCond, UseMaskForGaps);
15750b57cec5SDimitry Andric }
15760b57cec5SDimitry Andric
1577fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1578fe6060f1SDimitry Andric unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1579fe6060f1SDimitry Andric Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
15805ffd83dbSDimitry Andric using namespace PatternMatch;
15815ffd83dbSDimitry Andric if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
15825ffd83dbSDimitry Andric return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
15835ffd83dbSDimitry Andric Alignment, CostKind, I);
15845ffd83dbSDimitry Andric
15855ffd83dbSDimitry Andric assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
15865ffd83dbSDimitry Andric auto *VTy = cast<FixedVectorType>(DataTy);
15875ffd83dbSDimitry Andric
15885ffd83dbSDimitry Andric // TODO: Splitting, once we do that.
15895ffd83dbSDimitry Andric
15905ffd83dbSDimitry Andric unsigned NumElems = VTy->getNumElements();
15915ffd83dbSDimitry Andric unsigned EltSize = VTy->getScalarSizeInBits();
1592bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
15935ffd83dbSDimitry Andric
15945ffd83dbSDimitry Andric // For now, it is assumed that for the MVE gather instructions the loads are
15955ffd83dbSDimitry Andric // all effectively serialised. This means the cost is the scalar cost
15965ffd83dbSDimitry Andric // multiplied by the number of elements being loaded. This is possibly very
15975ffd83dbSDimitry Andric // conservative, but even so we still end up vectorising loops because the
15985ffd83dbSDimitry Andric // cost per iteration for many loops is lower than for scalar loops.
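// For example, a v4i32 gather is costed as 4 * the MVE vector cost factor.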
1599fe6060f1SDimitry Andric InstructionCost VectorCost =
1600fe6060f1SDimitry Andric NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
16015ffd83dbSDimitry Andric // The scalarization cost should be a lot higher. We use the number of vector
160206c3fb27SDimitry Andric // elements plus the scalarization overhead. If masking is required then a lot
160306c3fb27SDimitry Andric // of little blocks will be needed and potentially a scalarized p0 mask,
160406c3fb27SDimitry Andric // greatly increasing the cost.
1605fe6060f1SDimitry Andric InstructionCost ScalarCost =
160606c3fb27SDimitry Andric NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1607bdd1243dSDimitry Andric BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1608bdd1243dSDimitry Andric CostKind) +
1609bdd1243dSDimitry Andric BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1610bdd1243dSDimitry Andric CostKind);
16115ffd83dbSDimitry Andric
1612e8d8bef9SDimitry Andric if (EltSize < 8 || Alignment < EltSize / 8)
16135ffd83dbSDimitry Andric return ScalarCost;
16145ffd83dbSDimitry Andric
16155ffd83dbSDimitry Andric unsigned ExtSize = EltSize;
16165ffd83dbSDimitry Andric // Check whether there's a single user that asks for an extended type
16175ffd83dbSDimitry Andric if (I != nullptr) {
16185ffd83dbSDimitry Andric // Depending on the caller of this function, a gather instruction will
16195ffd83dbSDimitry Andric // either have opcode Instruction::Load or be a call to the masked_gather
16205ffd83dbSDimitry Andric // intrinsic
16215ffd83dbSDimitry Andric if ((I->getOpcode() == Instruction::Load ||
16225ffd83dbSDimitry Andric match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
16235ffd83dbSDimitry Andric I->hasOneUse()) {
16245ffd83dbSDimitry Andric const User *Us = *I->users().begin();
16255ffd83dbSDimitry Andric if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
16265ffd83dbSDimitry Andric // only allow valid type combinations
16275ffd83dbSDimitry Andric unsigned TypeSize =
16285ffd83dbSDimitry Andric cast<Instruction>(Us)->getType()->getScalarSizeInBits();
16295ffd83dbSDimitry Andric if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
16305ffd83dbSDimitry Andric (TypeSize == 16 && EltSize == 8)) &&
16315ffd83dbSDimitry Andric TypeSize * NumElems == 128) {
16325ffd83dbSDimitry Andric ExtSize = TypeSize;
16335ffd83dbSDimitry Andric }
16345ffd83dbSDimitry Andric }
16355ffd83dbSDimitry Andric }
16365ffd83dbSDimitry Andric // Check whether the input data needs to be truncated
16375ffd83dbSDimitry Andric TruncInst *T;
16385ffd83dbSDimitry Andric if ((I->getOpcode() == Instruction::Store ||
16395ffd83dbSDimitry Andric match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
16405ffd83dbSDimitry Andric (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
16415ffd83dbSDimitry Andric // Only allow valid type combinations
16425ffd83dbSDimitry Andric unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
16435ffd83dbSDimitry Andric if (((EltSize == 16 && TypeSize == 32) ||
16445ffd83dbSDimitry Andric (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
16455ffd83dbSDimitry Andric TypeSize * NumElems == 128)
16465ffd83dbSDimitry Andric ExtSize = TypeSize;
16475ffd83dbSDimitry Andric }
16485ffd83dbSDimitry Andric }
16495ffd83dbSDimitry Andric
16505ffd83dbSDimitry Andric if (ExtSize * NumElems != 128 || NumElems < 4)
16515ffd83dbSDimitry Andric return ScalarCost;
16525ffd83dbSDimitry Andric
16535ffd83dbSDimitry Andric // Any (aligned) i32 gather will not need to be scalarised.
16545ffd83dbSDimitry Andric if (ExtSize == 32)
16555ffd83dbSDimitry Andric return VectorCost;
16565ffd83dbSDimitry Andric // For smaller types, we need to ensure that the gep's inputs are correctly
16575ffd83dbSDimitry Andric // extended from a small enough value. Other sizes (including i64) are
16585ffd83dbSDimitry Andric // scalarized for now.
16595ffd83dbSDimitry Andric if (ExtSize != 8 && ExtSize != 16)
16605ffd83dbSDimitry Andric return ScalarCost;
16615ffd83dbSDimitry Andric
16625ffd83dbSDimitry Andric if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
16635ffd83dbSDimitry Andric Ptr = BC->getOperand(0);
16645ffd83dbSDimitry Andric if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
16655ffd83dbSDimitry Andric if (GEP->getNumOperands() != 2)
16665ffd83dbSDimitry Andric return ScalarCost;
16675ffd83dbSDimitry Andric unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
16685ffd83dbSDimitry Andric // Scale needs to be correct (which is only relevant for i16s).
16695ffd83dbSDimitry Andric if (Scale != 1 && Scale * 8 != ExtSize)
16705ffd83dbSDimitry Andric return ScalarCost;
16715ffd83dbSDimitry Andric // And we need to zext (not sext) the indexes from a small enough type.
16725ffd83dbSDimitry Andric if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
16735ffd83dbSDimitry Andric if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
16745ffd83dbSDimitry Andric return VectorCost;
16755ffd83dbSDimitry Andric }
16765ffd83dbSDimitry Andric return ScalarCost;
16775ffd83dbSDimitry Andric }
16785ffd83dbSDimitry Andric return ScalarCost;
16795ffd83dbSDimitry Andric }
16805ffd83dbSDimitry Andric
1681fe6060f1SDimitry Andric InstructionCost
1682fe6060f1SDimitry Andric ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1683bdd1243dSDimitry Andric std::optional<FastMathFlags> FMF,
1684e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) {
1685fe6060f1SDimitry Andric
1686e8d8bef9SDimitry Andric EVT ValVT = TLI->getValueType(DL, ValTy);
1687e8d8bef9SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
16885f757f3fSDimitry Andric unsigned EltSize = ValVT.getScalarSizeInBits();
16895f757f3fSDimitry Andric
16905f757f3fSDimitry Andric // In general floating point reductions are a series of elementwise
16915f757f3fSDimitry Andric // operations, with free extracts on each step. These are either in-order or
16925f757f3fSDimitry Andric // treewise depending on whether that is allowed by the fast math flags.
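// For example, with MVE and reassociation allowed, a v8f32 fadd reduction is
// costed as one v4f32 fadd step plus four scalar fadds (the extracts are free).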
16935f757f3fSDimitry Andric if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
16945f757f3fSDimitry Andric ((EltSize == 32 && ST->hasVFP2Base()) ||
16955f757f3fSDimitry Andric (EltSize == 64 && ST->hasFP64()) ||
16965f757f3fSDimitry Andric (EltSize == 16 && ST->hasFullFP16()))) {
16975f757f3fSDimitry Andric unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
16985f757f3fSDimitry Andric unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
16995f757f3fSDimitry Andric InstructionCost VecCost = 0;
17005f757f3fSDimitry Andric while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
17015f757f3fSDimitry Andric NumElts * EltSize > VecLimit) {
17025f757f3fSDimitry Andric Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
17035f757f3fSDimitry Andric VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
17045f757f3fSDimitry Andric NumElts /= 2;
17055f757f3fSDimitry Andric }
17065f757f3fSDimitry Andric
17075f757f3fSDimitry Andric // For fp16 we need to extract the upper lane elements. MVE can add a
17085f757f3fSDimitry Andric // VREV+FMIN/MAX to perform another vector step instead.
17095f757f3fSDimitry Andric InstructionCost ExtractCost = 0;
17105f757f3fSDimitry Andric if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
17115f757f3fSDimitry Andric ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
17125f757f3fSDimitry Andric VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
17135f757f3fSDimitry Andric NumElts /= 2;
17145f757f3fSDimitry Andric } else if (ValVT.getVectorElementType() == MVT::f16)
17155f757f3fSDimitry Andric ExtractCost = NumElts / 2;
17165f757f3fSDimitry Andric
17175f757f3fSDimitry Andric return VecCost + ExtractCost +
17185f757f3fSDimitry Andric NumElts *
17195f757f3fSDimitry Andric getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
17205f757f3fSDimitry Andric }
17215f757f3fSDimitry Andric
17225f757f3fSDimitry Andric if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
17235f757f3fSDimitry Andric (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
17245f757f3fSDimitry Andric unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
17255f757f3fSDimitry Andric unsigned VecLimit =
17265f757f3fSDimitry Andric ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
17275f757f3fSDimitry Andric InstructionCost VecCost = 0;
17285f757f3fSDimitry Andric while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
17295f757f3fSDimitry Andric Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
17305f757f3fSDimitry Andric VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
17315f757f3fSDimitry Andric NumElts /= 2;
17325f757f3fSDimitry Andric }
17335f757f3fSDimitry Andric // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
17345f757f3fSDimitry Andric // step.
17355f757f3fSDimitry Andric if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
17365f757f3fSDimitry Andric NumElts * EltSize == 64) {
17375f757f3fSDimitry Andric Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
17385f757f3fSDimitry Andric VecCost += ST->getMVEVectorCostFactor(CostKind) +
17395f757f3fSDimitry Andric getArithmeticInstrCost(Opcode, VecTy, CostKind);
17405f757f3fSDimitry Andric NumElts /= 2;
17415f757f3fSDimitry Andric }
17425f757f3fSDimitry Andric
17435f757f3fSDimitry Andric // From here we extract the elements and perform the and/or/xor.
17445f757f3fSDimitry Andric InstructionCost ExtractCost = NumElts;
17455f757f3fSDimitry Andric return VecCost + ExtractCost +
17465f757f3fSDimitry Andric (NumElts - 1) * getArithmeticInstrCost(
17475f757f3fSDimitry Andric Opcode, ValTy->getElementType(), CostKind);
17485f757f3fSDimitry Andric }
17495f757f3fSDimitry Andric
17505f757f3fSDimitry Andric if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
17515f757f3fSDimitry Andric TTI::requiresOrderedReduction(FMF))
1752fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1753e8d8bef9SDimitry Andric
1754bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1755e8d8bef9SDimitry Andric
1756e8d8bef9SDimitry Andric static const CostTblEntry CostTblAdd[]{
1757e8d8bef9SDimitry Andric {ISD::ADD, MVT::v16i8, 1},
1758e8d8bef9SDimitry Andric {ISD::ADD, MVT::v8i16, 1},
1759e8d8bef9SDimitry Andric {ISD::ADD, MVT::v4i32, 1},
1760e8d8bef9SDimitry Andric };
1761e8d8bef9SDimitry Andric if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1762fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1763e8d8bef9SDimitry Andric
1764fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1765e8d8bef9SDimitry Andric }
1766e8d8bef9SDimitry Andric
1767bdd1243dSDimitry Andric InstructionCost ARMTTIImpl::getExtendedReductionCost(
1768bdd1243dSDimitry Andric unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
176906c3fb27SDimitry Andric FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1770bdd1243dSDimitry Andric EVT ValVT = TLI->getValueType(DL, ValTy);
1771bdd1243dSDimitry Andric EVT ResVT = TLI->getValueType(DL, ResTy);
1772bdd1243dSDimitry Andric
1773bdd1243dSDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
1774bdd1243dSDimitry Andric
1775bdd1243dSDimitry Andric switch (ISD) {
1776bdd1243dSDimitry Andric case ISD::ADD:
1777bdd1243dSDimitry Andric if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1778bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1779bdd1243dSDimitry Andric
1780bdd1243dSDimitry Andric // The legal cases are:
1781bdd1243dSDimitry Andric // VADDV u/s 8/16/32
1782bdd1243dSDimitry Andric // VADDLV u/s 32
1783bdd1243dSDimitry Andric // Codegen currently cannot always handle larger than legal vectors very
1784bdd1243dSDimitry Andric // well, especially for predicated reductions where the mask needs to be
1785bdd1243dSDimitry Andric // split, so restrict to 128bit or smaller input types.
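// As an illustrative sketch (assuming the usual reduction IR the vectorizer
// emits): an add reduction of <16 x i8> extended into an i32 result maps to a
// single VADDV.u8/s8, and <4 x i32> extended into an i64 result maps to
// VADDLV.u32/s32, which is what the (LT.second, result-size) pairs below
// encode.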
1786bdd1243dSDimitry Andric unsigned RevVTSize = ResVT.getSizeInBits();
1787bdd1243dSDimitry Andric if (ValVT.getSizeInBits() <= 128 &&
1788bdd1243dSDimitry Andric ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1789bdd1243dSDimitry Andric (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1790bdd1243dSDimitry Andric (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1791bdd1243dSDimitry Andric return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1792bdd1243dSDimitry Andric }
1793bdd1243dSDimitry Andric break;
1794bdd1243dSDimitry Andric default:
1795bdd1243dSDimitry Andric break;
1796bdd1243dSDimitry Andric }
1797bdd1243dSDimitry Andric return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1798bdd1243dSDimitry Andric CostKind);
1799bdd1243dSDimitry Andric }
1800bdd1243dSDimitry Andric
1801e8d8bef9SDimitry Andric InstructionCost
1802bdd1243dSDimitry Andric ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1803bdd1243dSDimitry Andric VectorType *ValTy,
1804e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) {
1805e8d8bef9SDimitry Andric EVT ValVT = TLI->getValueType(DL, ValTy);
1806e8d8bef9SDimitry Andric EVT ResVT = TLI->getValueType(DL, ResTy);
1807349cc55cSDimitry Andric
1808e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1809bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1810349cc55cSDimitry Andric
1811349cc55cSDimitry Andric // The legal cases are:
1812349cc55cSDimitry Andric // VMLAV u/s 8/16/32
1813349cc55cSDimitry Andric // VMLALV u/s 16/32
1814349cc55cSDimitry Andric // Codegen currently cannot always handle larger than legal vectors very
1815349cc55cSDimitry Andric // well, especially for predicated reductions where the mask needs to be
1816349cc55cSDimitry Andric // split, so restrict to 128bit or smaller input types.
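// For example (a sketch of the common dot-product shape, not an exhaustive
// list): sum += ext(a[i]) * ext(b[i]) over <16 x i8> with an i32 accumulator
// maps to VMLAV.u8/s8, and over <4 x i32> with an i64 accumulator to
// VMLALV.u32/s32, matching the (LT.second, result-size) pairs below.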
1817349cc55cSDimitry Andric unsigned RevVTSize = ResVT.getSizeInBits();
1818349cc55cSDimitry Andric if (ValVT.getSizeInBits() <= 128 &&
1819349cc55cSDimitry Andric ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1820bdd1243dSDimitry Andric (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1821349cc55cSDimitry Andric (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1822fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1823e8d8bef9SDimitry Andric }
1824e8d8bef9SDimitry Andric
1825bdd1243dSDimitry Andric return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1826e8d8bef9SDimitry Andric }
1827e8d8bef9SDimitry Andric
1828fe6060f1SDimitry Andric InstructionCost
18295f757f3fSDimitry Andric ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
18305f757f3fSDimitry Andric FastMathFlags FMF,
18315f757f3fSDimitry Andric TTI::TargetCostKind CostKind) {
18325f757f3fSDimitry Andric EVT ValVT = TLI->getValueType(DL, Ty);
18335f757f3fSDimitry Andric
18345f757f3fSDimitry Andric // In general floating point reductions are a series of elementwise
18355f757f3fSDimitry Andric // operations, with free extracts on each step. These are either in-order or
18365f757f3fSDimitry Andric // treewise depending on whether that is allowed by the fast math flags.
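// For instance (a rough sketch): an fminnum reduction of <4 x float> under
// MVE needs no vector splitting (4 x 32 bits already fits the 128-bit limit),
// so it is costed below as three element-wise fmin steps with free extracts;
// wider types are first halved with vector min/max steps until they fit.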
18375f757f3fSDimitry Andric if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
18385f757f3fSDimitry Andric ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
18395f757f3fSDimitry Andric (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
18405f757f3fSDimitry Andric (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
18415f757f3fSDimitry Andric unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
18425f757f3fSDimitry Andric unsigned EltSize = ValVT.getScalarSizeInBits();
18435f757f3fSDimitry Andric unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
18445f757f3fSDimitry Andric InstructionCost VecCost;
18455f757f3fSDimitry Andric while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
18465f757f3fSDimitry Andric Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
18475f757f3fSDimitry Andric IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
18485f757f3fSDimitry Andric VecCost += getIntrinsicInstrCost(ICA, CostKind);
18495f757f3fSDimitry Andric NumElts /= 2;
18505f757f3fSDimitry Andric }
18515f757f3fSDimitry Andric
18525f757f3fSDimitry Andric // For fp16 we need to extract the upper lane elements. MVE can add a
18535f757f3fSDimitry Andric // VREV+FMIN/MAX to perform another vector step instead.
18545f757f3fSDimitry Andric InstructionCost ExtractCost = 0;
18555f757f3fSDimitry Andric if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
18565f757f3fSDimitry Andric NumElts == 8) {
18575f757f3fSDimitry Andric VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
18585f757f3fSDimitry Andric NumElts /= 2;
18595f757f3fSDimitry Andric } else if (ValVT.getVectorElementType() == MVT::f16)
18605f757f3fSDimitry Andric ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
18615f757f3fSDimitry Andric
18625f757f3fSDimitry Andric IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
18635f757f3fSDimitry Andric {Ty->getElementType(), Ty->getElementType()},
18645f757f3fSDimitry Andric FMF);
18655f757f3fSDimitry Andric return VecCost + ExtractCost +
18665f757f3fSDimitry Andric (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
18675f757f3fSDimitry Andric }
18685f757f3fSDimitry Andric
18695f757f3fSDimitry Andric if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
18705f757f3fSDimitry Andric IID == Intrinsic::umin || IID == Intrinsic::umax) {
18715f757f3fSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
18725f757f3fSDimitry Andric
18735f757f3fSDimitry Andric // All costs are the same for u/s min/max. These lower to vminv/vmaxv, which
18745f757f3fSDimitry Andric // are given a slightly higher cost as they tend to take multiple cycles for
18755f757f3fSDimitry Andric // smaller type sizes.
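// E.g. (illustrative): a smin/umin reduction of <16 x i8> lowers to
// VMINV.S8/U8 and is costed at 4x the MVE factor, while <4 x i32> only needs
// 2x.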
18765f757f3fSDimitry Andric static const CostTblEntry CostTblAdd[]{
18775f757f3fSDimitry Andric {ISD::SMIN, MVT::v16i8, 4},
18785f757f3fSDimitry Andric {ISD::SMIN, MVT::v8i16, 3},
18795f757f3fSDimitry Andric {ISD::SMIN, MVT::v4i32, 2},
18805f757f3fSDimitry Andric };
18815f757f3fSDimitry Andric if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
18825f757f3fSDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
18835f757f3fSDimitry Andric }
18845f757f3fSDimitry Andric
18855f757f3fSDimitry Andric return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
18865f757f3fSDimitry Andric }
18875f757f3fSDimitry Andric
18885f757f3fSDimitry Andric InstructionCost
1889fe6060f1SDimitry Andric ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1890e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) {
1891e8d8bef9SDimitry Andric switch (ICA.getID()) {
1892e8d8bef9SDimitry Andric case Intrinsic::get_active_lane_mask:
1893e8d8bef9SDimitry Andric // Currently we make a somewhat optimistic assumption that
1894e8d8bef9SDimitry Andric // active_lane_masks are always free. In reality it may be freely folded
1895e8d8bef9SDimitry Andric // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1896e8d8bef9SDimitry Andric // of add/icmp code. We may need to improve this in the future, but being
1897e8d8bef9SDimitry Andric // able to detect if it is free or not involves looking at a lot of other
1898e8d8bef9SDimitry Andric // code. We currently assume that the vectorizer inserted these, and knew
1899e8d8bef9SDimitry Andric // what it was doing in adding one.
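// For example (a sketch of the intended case): inside a tail-folded loop,
// llvm.get.active.lane.mask.v4i1.i32(%index, %tc) is expected to fold into a
// VCTP32 once MVETailPredication runs, so charging 0 here reflects the common
// outcome rather than the worst-case add/icmp expansion.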
1900e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps())
1901e8d8bef9SDimitry Andric return 0;
1902e8d8bef9SDimitry Andric break;
1903e8d8bef9SDimitry Andric case Intrinsic::sadd_sat:
1904e8d8bef9SDimitry Andric case Intrinsic::ssub_sat:
1905e8d8bef9SDimitry Andric case Intrinsic::uadd_sat:
1906e8d8bef9SDimitry Andric case Intrinsic::usub_sat: {
1907e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps())
1908e8d8bef9SDimitry Andric break;
1909e8d8bef9SDimitry Andric Type *VT = ICA.getReturnType();
1910e8d8bef9SDimitry Andric
1911bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1912e8d8bef9SDimitry Andric if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1913e8d8bef9SDimitry Andric LT.second == MVT::v16i8) {
1914fe6060f1SDimitry Andric // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1915e8d8bef9SDimitry Andric // need to extend the type, as it uses shr(qadd(shl, shl)).
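// E.g. (a sketch): llvm.uadd.sat on <16 x i8> is a single VQADD.U8
// (Instrs == 1), whereas saturating adds on elements narrower than the
// legalized type are modelled as shl + shl + vqadd + shr (Instrs == 4).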
1916fe6060f1SDimitry Andric unsigned Instrs =
1917fe6060f1SDimitry Andric LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1918fe6060f1SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1919e8d8bef9SDimitry Andric }
1920e8d8bef9SDimitry Andric break;
1921e8d8bef9SDimitry Andric }
1922fe6060f1SDimitry Andric case Intrinsic::abs:
1923fe6060f1SDimitry Andric case Intrinsic::smin:
1924fe6060f1SDimitry Andric case Intrinsic::smax:
1925fe6060f1SDimitry Andric case Intrinsic::umin:
1926fe6060f1SDimitry Andric case Intrinsic::umax: {
1927fe6060f1SDimitry Andric if (!ST->hasMVEIntegerOps())
1928fe6060f1SDimitry Andric break;
1929fe6060f1SDimitry Andric Type *VT = ICA.getReturnType();
1930fe6060f1SDimitry Andric
1931bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1932fe6060f1SDimitry Andric if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1933fe6060f1SDimitry Andric LT.second == MVT::v16i8)
1934fe6060f1SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind);
1935fe6060f1SDimitry Andric break;
1936fe6060f1SDimitry Andric }
1937fe6060f1SDimitry Andric case Intrinsic::minnum:
1938fe6060f1SDimitry Andric case Intrinsic::maxnum: {
1939fe6060f1SDimitry Andric if (!ST->hasMVEFloatOps())
1940fe6060f1SDimitry Andric break;
1941fe6060f1SDimitry Andric Type *VT = ICA.getReturnType();
1942bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1943fe6060f1SDimitry Andric if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1944fe6060f1SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind);
1945fe6060f1SDimitry Andric break;
1946fe6060f1SDimitry Andric }
194781ad6265SDimitry Andric case Intrinsic::fptosi_sat:
194881ad6265SDimitry Andric case Intrinsic::fptoui_sat: {
194981ad6265SDimitry Andric if (ICA.getArgTypes().empty())
195081ad6265SDimitry Andric break;
195181ad6265SDimitry Andric bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1952bdd1243dSDimitry Andric auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
195381ad6265SDimitry Andric EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
195481ad6265SDimitry Andric // Check for the legal types, with the correct subtarget features.
195581ad6265SDimitry Andric if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
195681ad6265SDimitry Andric (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
195781ad6265SDimitry Andric (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
195881ad6265SDimitry Andric return LT.first;
195981ad6265SDimitry Andric
196081ad6265SDimitry Andric // Equally for MVE vector types
196181ad6265SDimitry Andric if (ST->hasMVEFloatOps() &&
196281ad6265SDimitry Andric (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
196381ad6265SDimitry Andric LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
196481ad6265SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind);
196581ad6265SDimitry Andric
196681ad6265SDimitry Andric // Otherwise we use a legal convert followed by a min+max
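// (Roughly: an fptosi.sat from f32 to i8, say, is costed as one legal
// f32->i32 convert plus an i32 smin and smax to clamp into the i8 range.)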
196781ad6265SDimitry Andric if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
196881ad6265SDimitry Andric (ST->hasFP64() && LT.second == MVT::f64) ||
196981ad6265SDimitry Andric (ST->hasFullFP16() && LT.second == MVT::f16) ||
197081ad6265SDimitry Andric (ST->hasMVEFloatOps() &&
197181ad6265SDimitry Andric (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
197281ad6265SDimitry Andric LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
197381ad6265SDimitry Andric Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
197481ad6265SDimitry Andric LT.second.getScalarSizeInBits());
197581ad6265SDimitry Andric InstructionCost Cost =
197681ad6265SDimitry Andric LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
197781ad6265SDimitry Andric IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
197881ad6265SDimitry Andric : Intrinsic::umin,
197981ad6265SDimitry Andric LegalTy, {LegalTy, LegalTy});
198081ad6265SDimitry Andric Cost += getIntrinsicInstrCost(Attrs1, CostKind);
198181ad6265SDimitry Andric IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
198281ad6265SDimitry Andric : Intrinsic::umax,
198381ad6265SDimitry Andric LegalTy, {LegalTy, LegalTy});
198481ad6265SDimitry Andric Cost += getIntrinsicInstrCost(Attrs2, CostKind);
198581ad6265SDimitry Andric return LT.first * Cost;
198681ad6265SDimitry Andric }
198781ad6265SDimitry Andric break;
198881ad6265SDimitry Andric }
1989e8d8bef9SDimitry Andric }
1990e8d8bef9SDimitry Andric
1991e8d8bef9SDimitry Andric return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1992e8d8bef9SDimitry Andric }
1993e8d8bef9SDimitry Andric
19940b57cec5SDimitry Andric bool ARMTTIImpl::isLoweredToCall(const Function *F) {
19950b57cec5SDimitry Andric if (!F->isIntrinsic())
199681ad6265SDimitry Andric return BaseT::isLoweredToCall(F);
19970b57cec5SDimitry Andric
19980b57cec5SDimitry Andric // Assume all Arm-specific intrinsics map to an instruction.
19995f757f3fSDimitry Andric if (F->getName().starts_with("llvm.arm"))
20000b57cec5SDimitry Andric return false;
20010b57cec5SDimitry Andric
20020b57cec5SDimitry Andric switch (F->getIntrinsicID()) {
20030b57cec5SDimitry Andric default: break;
20040b57cec5SDimitry Andric case Intrinsic::powi:
20050b57cec5SDimitry Andric case Intrinsic::sin:
20060b57cec5SDimitry Andric case Intrinsic::cos:
20070b57cec5SDimitry Andric case Intrinsic::pow:
20080b57cec5SDimitry Andric case Intrinsic::log:
20090b57cec5SDimitry Andric case Intrinsic::log10:
20100b57cec5SDimitry Andric case Intrinsic::log2:
20110b57cec5SDimitry Andric case Intrinsic::exp:
20120b57cec5SDimitry Andric case Intrinsic::exp2:
20130b57cec5SDimitry Andric return true;
20140b57cec5SDimitry Andric case Intrinsic::sqrt:
20150b57cec5SDimitry Andric case Intrinsic::fabs:
20160b57cec5SDimitry Andric case Intrinsic::copysign:
20170b57cec5SDimitry Andric case Intrinsic::floor:
20180b57cec5SDimitry Andric case Intrinsic::ceil:
20190b57cec5SDimitry Andric case Intrinsic::trunc:
20200b57cec5SDimitry Andric case Intrinsic::rint:
20210b57cec5SDimitry Andric case Intrinsic::nearbyint:
20220b57cec5SDimitry Andric case Intrinsic::round:
20230b57cec5SDimitry Andric case Intrinsic::canonicalize:
20240b57cec5SDimitry Andric case Intrinsic::lround:
20250b57cec5SDimitry Andric case Intrinsic::llround:
20260b57cec5SDimitry Andric case Intrinsic::lrint:
20270b57cec5SDimitry Andric case Intrinsic::llrint:
20280b57cec5SDimitry Andric if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
20290b57cec5SDimitry Andric return true;
20300b57cec5SDimitry Andric if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
20310b57cec5SDimitry Andric return true;
20320b57cec5SDimitry Andric // Some operations can be handled by vector instructions and assume
20330b57cec5SDimitry Andric // unsupported vectors will be expanded into supported scalar ones.
20340b57cec5SDimitry Andric // TODO Handle scalar operations properly.
20350b57cec5SDimitry Andric return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
20360b57cec5SDimitry Andric case Intrinsic::masked_store:
20370b57cec5SDimitry Andric case Intrinsic::masked_load:
20380b57cec5SDimitry Andric case Intrinsic::masked_gather:
20390b57cec5SDimitry Andric case Intrinsic::masked_scatter:
20400b57cec5SDimitry Andric return !ST->hasMVEIntegerOps();
20410b57cec5SDimitry Andric case Intrinsic::sadd_with_overflow:
20420b57cec5SDimitry Andric case Intrinsic::uadd_with_overflow:
20430b57cec5SDimitry Andric case Intrinsic::ssub_with_overflow:
20440b57cec5SDimitry Andric case Intrinsic::usub_with_overflow:
20450b57cec5SDimitry Andric case Intrinsic::sadd_sat:
20460b57cec5SDimitry Andric case Intrinsic::uadd_sat:
20470b57cec5SDimitry Andric case Intrinsic::ssub_sat:
20480b57cec5SDimitry Andric case Intrinsic::usub_sat:
20490b57cec5SDimitry Andric return false;
20500b57cec5SDimitry Andric }
20510b57cec5SDimitry Andric
20520b57cec5SDimitry Andric return BaseT::isLoweredToCall(F);
20530b57cec5SDimitry Andric }
20540b57cec5SDimitry Andric
2055e8d8bef9SDimitry Andric bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
20560b57cec5SDimitry Andric unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
20570b57cec5SDimitry Andric EVT VT = TLI->getValueType(DL, I.getType(), true);
20580b57cec5SDimitry Andric if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
20590b57cec5SDimitry Andric return true;
20600b57cec5SDimitry Andric
20610b57cec5SDimitry Andric // Check if an intrinsic will be lowered to a call and assume that any
20620b57cec5SDimitry Andric // other CallInst will generate a bl.
20630b57cec5SDimitry Andric if (auto *Call = dyn_cast<CallInst>(&I)) {
2064e8d8bef9SDimitry Andric if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2065e8d8bef9SDimitry Andric switch(II->getIntrinsicID()) {
2066e8d8bef9SDimitry Andric case Intrinsic::memcpy:
2067e8d8bef9SDimitry Andric case Intrinsic::memset:
2068e8d8bef9SDimitry Andric case Intrinsic::memmove:
2069e8d8bef9SDimitry Andric return getNumMemOps(II) == -1;
2070e8d8bef9SDimitry Andric default:
20710b57cec5SDimitry Andric if (const Function *F = Call->getCalledFunction())
20720b57cec5SDimitry Andric return isLoweredToCall(F);
20730b57cec5SDimitry Andric }
2074e8d8bef9SDimitry Andric }
20750b57cec5SDimitry Andric return true;
20760b57cec5SDimitry Andric }
20770b57cec5SDimitry Andric
20780b57cec5SDimitry Andric // FPv5 provides conversions between integer, double-precision,
20790b57cec5SDimitry Andric // single-precision, and half-precision formats.
20800b57cec5SDimitry Andric switch (I.getOpcode()) {
20810b57cec5SDimitry Andric default:
20820b57cec5SDimitry Andric break;
20830b57cec5SDimitry Andric case Instruction::FPToSI:
20840b57cec5SDimitry Andric case Instruction::FPToUI:
20850b57cec5SDimitry Andric case Instruction::SIToFP:
20860b57cec5SDimitry Andric case Instruction::UIToFP:
20870b57cec5SDimitry Andric case Instruction::FPTrunc:
20880b57cec5SDimitry Andric case Instruction::FPExt:
20890b57cec5SDimitry Andric return !ST->hasFPARMv8Base();
20900b57cec5SDimitry Andric }
20910b57cec5SDimitry Andric
20920b57cec5SDimitry Andric // FIXME: Unfortunately the approach of checking the Operation Action does
20930b57cec5SDimitry Andric // not catch all cases of Legalization that use library calls. Our
20940b57cec5SDimitry Andric // Legalization step categorizes some transformations into library calls as
20950b57cec5SDimitry Andric // Custom, Expand or even Legal when doing type legalization. So for now
20960b57cec5SDimitry Andric // we have to special case for instance the SDIV of 64bit integers and the
20970b57cec5SDimitry Andric // use of floating point emulation.
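// (For instance, a 64-bit sdiv/udiv is typically expanded to an
// __aeabi_ldivmod/__aeabi_uldivmod runtime call on AEABI targets, which the
// OperationAction check above does not report as a LibCall.)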
20980b57cec5SDimitry Andric if (VT.isInteger() && VT.getSizeInBits() >= 64) {
20990b57cec5SDimitry Andric switch (ISD) {
21000b57cec5SDimitry Andric default:
21010b57cec5SDimitry Andric break;
21020b57cec5SDimitry Andric case ISD::SDIV:
21030b57cec5SDimitry Andric case ISD::UDIV:
21040b57cec5SDimitry Andric case ISD::SREM:
21050b57cec5SDimitry Andric case ISD::UREM:
21060b57cec5SDimitry Andric case ISD::SDIVREM:
21070b57cec5SDimitry Andric case ISD::UDIVREM:
21080b57cec5SDimitry Andric return true;
21090b57cec5SDimitry Andric }
21100b57cec5SDimitry Andric }
21110b57cec5SDimitry Andric
21120b57cec5SDimitry Andric // Assume all other non-float operations are supported.
21130b57cec5SDimitry Andric if (!VT.isFloatingPoint())
21140b57cec5SDimitry Andric return false;
21150b57cec5SDimitry Andric
21160b57cec5SDimitry Andric // We'll need a library call to handle most floats when using soft float.
21170b57cec5SDimitry Andric if (TLI->useSoftFloat()) {
21180b57cec5SDimitry Andric switch (I.getOpcode()) {
21190b57cec5SDimitry Andric default:
21200b57cec5SDimitry Andric return true;
21210b57cec5SDimitry Andric case Instruction::Alloca:
21220b57cec5SDimitry Andric case Instruction::Load:
21230b57cec5SDimitry Andric case Instruction::Store:
21240b57cec5SDimitry Andric case Instruction::Select:
21250b57cec5SDimitry Andric case Instruction::PHI:
21260b57cec5SDimitry Andric return false;
21270b57cec5SDimitry Andric }
21280b57cec5SDimitry Andric }
21290b57cec5SDimitry Andric
21300b57cec5SDimitry Andric // We'll need a libcall to perform double precision operations on a single
21310b57cec5SDimitry Andric // precision only FPU.
21320b57cec5SDimitry Andric if (I.getType()->isDoubleTy() && !ST->hasFP64())
21330b57cec5SDimitry Andric return true;
21340b57cec5SDimitry Andric
21350b57cec5SDimitry Andric // Likewise for half precision arithmetic.
21360b57cec5SDimitry Andric if (I.getType()->isHalfTy() && !ST->hasFullFP16())
21370b57cec5SDimitry Andric return true;
21380b57cec5SDimitry Andric
21390b57cec5SDimitry Andric return false;
2140e8d8bef9SDimitry Andric }
2141e8d8bef9SDimitry Andric
2142e8d8bef9SDimitry Andric bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2143e8d8bef9SDimitry Andric AssumptionCache &AC,
2144e8d8bef9SDimitry Andric TargetLibraryInfo *LibInfo,
2145e8d8bef9SDimitry Andric HardwareLoopInfo &HWLoopInfo) {
2146e8d8bef9SDimitry Andric // Low-overhead branches are only supported in the 'low-overhead branch'
2147e8d8bef9SDimitry Andric // extension of v8.1-m.
2148e8d8bef9SDimitry Andric if (!ST->hasLOB() || DisableLowOverheadLoops) {
2149e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2150e8d8bef9SDimitry Andric return false;
2151e8d8bef9SDimitry Andric }
2152e8d8bef9SDimitry Andric
2153e8d8bef9SDimitry Andric if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2154e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2155e8d8bef9SDimitry Andric return false;
2156e8d8bef9SDimitry Andric }
2157e8d8bef9SDimitry Andric
2158e8d8bef9SDimitry Andric const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2159e8d8bef9SDimitry Andric if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2160e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2161e8d8bef9SDimitry Andric return false;
2162e8d8bef9SDimitry Andric }
2163e8d8bef9SDimitry Andric
2164e8d8bef9SDimitry Andric const SCEV *TripCountSCEV =
2165e8d8bef9SDimitry Andric SE.getAddExpr(BackedgeTakenCount,
2166e8d8bef9SDimitry Andric SE.getOne(BackedgeTakenCount->getType()));
2167e8d8bef9SDimitry Andric
2168e8d8bef9SDimitry Andric // We need to store the trip count in LR, a 32-bit register.
2169e8d8bef9SDimitry Andric if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2170e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2171e8d8bef9SDimitry Andric return false;
2172e8d8bef9SDimitry Andric }
2173e8d8bef9SDimitry Andric
2174e8d8bef9SDimitry Andric // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2175e8d8bef9SDimitry Andric // point in generating a hardware loop if that's going to happen.
21760b57cec5SDimitry Andric
21770b57cec5SDimitry Andric auto IsHardwareLoopIntrinsic = [](Instruction &I) {
21780b57cec5SDimitry Andric if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
21790b57cec5SDimitry Andric switch (Call->getIntrinsicID()) {
21800b57cec5SDimitry Andric default:
21810b57cec5SDimitry Andric break;
2182e8d8bef9SDimitry Andric case Intrinsic::start_loop_iterations:
2183fe6060f1SDimitry Andric case Intrinsic::test_start_loop_iterations:
21840b57cec5SDimitry Andric case Intrinsic::loop_decrement:
21850b57cec5SDimitry Andric case Intrinsic::loop_decrement_reg:
21860b57cec5SDimitry Andric return true;
21870b57cec5SDimitry Andric }
21880b57cec5SDimitry Andric }
21890b57cec5SDimitry Andric return false;
21900b57cec5SDimitry Andric };
21910b57cec5SDimitry Andric
21920b57cec5SDimitry Andric // Scan the instructions to see if there are any that we know will turn into a
2193e8d8bef9SDimitry Andric // call or if this loop is already a low-overhead loop or will become a tail
2194e8d8bef9SDimitry Andric // predicated loop.
2195e8d8bef9SDimitry Andric bool IsTailPredLoop = false;
21960b57cec5SDimitry Andric auto ScanLoop = [&](Loop *L) {
21970b57cec5SDimitry Andric for (auto *BB : L->getBlocks()) {
21980b57cec5SDimitry Andric for (auto &I : *BB) {
2199e8d8bef9SDimitry Andric if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2200e8d8bef9SDimitry Andric isa<InlineAsm>(I)) {
22015ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
22020b57cec5SDimitry Andric return false;
22030b57cec5SDimitry Andric }
2204e8d8bef9SDimitry Andric if (auto *II = dyn_cast<IntrinsicInst>(&I))
2205e8d8bef9SDimitry Andric IsTailPredLoop |=
2206e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2207e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2208e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2209e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2210e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
22110b57cec5SDimitry Andric }
22125ffd83dbSDimitry Andric }
22130b57cec5SDimitry Andric return true;
22140b57cec5SDimitry Andric };
22150b57cec5SDimitry Andric
22160b57cec5SDimitry Andric // Visit inner loops.
2217bdd1243dSDimitry Andric for (auto *Inner : *L)
22180b57cec5SDimitry Andric if (!ScanLoop(Inner))
22190b57cec5SDimitry Andric return false;
22200b57cec5SDimitry Andric
22210b57cec5SDimitry Andric if (!ScanLoop(L))
22220b57cec5SDimitry Andric return false;
22230b57cec5SDimitry Andric
22240b57cec5SDimitry Andric // TODO: Check whether the trip count calculation is expensive. If L is the
22250b57cec5SDimitry Andric // inner loop but we know it has a low trip count, calculating that trip
22260b57cec5SDimitry Andric // count (in the parent loop) may be detrimental.
22270b57cec5SDimitry Andric
22280b57cec5SDimitry Andric LLVMContext &C = L->getHeader()->getContext();
22290b57cec5SDimitry Andric HWLoopInfo.CounterInReg = true;
22300b57cec5SDimitry Andric HWLoopInfo.IsNestingLegal = false;
2231e8d8bef9SDimitry Andric HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
22320b57cec5SDimitry Andric HWLoopInfo.CountType = Type::getInt32Ty(C);
22330b57cec5SDimitry Andric HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
22340b57cec5SDimitry Andric return true;
22350b57cec5SDimitry Andric }
22360b57cec5SDimitry Andric
2237480093f4SDimitry Andric static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2238480093f4SDimitry Andric // We don't allow icmps, and because we only look at single-block loops,
2239480093f4SDimitry Andric // we simply count the icmps, i.e. there should only be 1 for the backedge.
2240480093f4SDimitry Andric if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2241480093f4SDimitry Andric return false;
2242349cc55cSDimitry Andric // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2243349cc55cSDimitry Andric // not currently canonical, but soon will be. Code without them uses icmp, and
2244349cc55cSDimitry Andric // so is not tail predicated as per the condition above. In order to get the
2245349cc55cSDimitry Andric // same performance we treat min and max the same as an icmp for tailpred
2246349cc55cSDimitry Andric // purposes for the moment (we often rely on non-tailpred and higher VFs to
2247349cc55cSDimitry Andric // pick more optimal instructions like VQDMULH. They need to be recognized
2248349cc55cSDimitry Andric // directly by the vectorizer).
2249349cc55cSDimitry Andric if (auto *II = dyn_cast<IntrinsicInst>(&I))
2250349cc55cSDimitry Andric if ((II->getIntrinsicID() == Intrinsic::smin ||
2251349cc55cSDimitry Andric II->getIntrinsicID() == Intrinsic::smax ||
2252349cc55cSDimitry Andric II->getIntrinsicID() == Intrinsic::umin ||
2253349cc55cSDimitry Andric II->getIntrinsicID() == Intrinsic::umax) &&
2254349cc55cSDimitry Andric ++ICmpCount > 1)
2255349cc55cSDimitry Andric return false;
2256480093f4SDimitry Andric
2257480093f4SDimitry Andric if (isa<FCmpInst>(&I))
2258480093f4SDimitry Andric return false;
2259480093f4SDimitry Andric
2260480093f4SDimitry Andric // We could allow extending/narrowing FP loads/stores, but codegen is
2261480093f4SDimitry Andric // too inefficient so reject this for now.
2262480093f4SDimitry Andric if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2263480093f4SDimitry Andric return false;
2264480093f4SDimitry Andric
2265480093f4SDimitry Andric // Extends have to be extending-loads
2266480093f4SDimitry Andric if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2267480093f4SDimitry Andric if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2268480093f4SDimitry Andric return false;
2269480093f4SDimitry Andric
2270480093f4SDimitry Andric // Truncs have to be narrowing-stores
2271480093f4SDimitry Andric if (isa<TruncInst>(&I) )
2272480093f4SDimitry Andric if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2273480093f4SDimitry Andric return false;
2274480093f4SDimitry Andric
2275480093f4SDimitry Andric return true;
2276480093f4SDimitry Andric }
2277480093f4SDimitry Andric
2278480093f4SDimitry Andric // To set up a tail-predicated loop, we need to know the total number of
2279480093f4SDimitry Andric // elements processed by that loop. Thus, we need to determine the element
2280480093f4SDimitry Andric // size and:
2281480093f4SDimitry Andric // 1) it should be uniform for all operations in the vector loop, so we
2282480093f4SDimitry Andric // e.g. don't want any widening/narrowing operations.
2283480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations
2284480093f4SDimitry Andric // that work on i64s.
2285480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the
2286480093f4SDimitry Andric // tail-predication masks/predicates the right lanes.
2287480093f4SDimitry Andric //
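// For example (an informal sketch): a loop doing c[i] = a[i] + b[i] over i32
// satisfies all three points, whereas widening i16 loads to i32 is only
// accepted as an extending load (checked per instruction above), and any i64
// arithmetic or a reversed (stride -1) access rules the loop out below.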
2288480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2289480093f4SDimitry Andric const DataLayout &DL,
2290480093f4SDimitry Andric const LoopAccessInfo *LAI) {
22915ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
22925ffd83dbSDimitry Andric
2293e8d8bef9SDimitry Andric // If there are live-out values, it is probably a reduction. We can predicate
2294e8d8bef9SDimitry Andric // most reduction operations freely under MVE using a combination of
2295e8d8bef9SDimitry Andric // prefer-predicated-reduction-select and inloop reductions. We limit this to
2296e8d8bef9SDimitry Andric // floating point and integer reductions, but don't check for operators
2297e8d8bef9SDimitry Andric // specifically here. If the value ends up not being a reduction (and so the
2298e8d8bef9SDimitry Andric // vectorizer cannot tailfold the loop), we should fall back to standard
2299e8d8bef9SDimitry Andric // vectorization automatically.
23005ffd83dbSDimitry Andric SmallVector< Instruction *, 8 > LiveOuts;
23015ffd83dbSDimitry Andric LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2302e8d8bef9SDimitry Andric bool ReductionsDisabled =
23035ffd83dbSDimitry Andric EnableTailPredication == TailPredication::EnabledNoReductions ||
23045ffd83dbSDimitry Andric EnableTailPredication == TailPredication::ForceEnabledNoReductions;
23055ffd83dbSDimitry Andric
23065ffd83dbSDimitry Andric for (auto *I : LiveOuts) {
2307e8d8bef9SDimitry Andric if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2308e8d8bef9SDimitry Andric !I->getType()->isHalfTy()) {
2309e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
23105ffd83dbSDimitry Andric "live-out value\n");
23115ffd83dbSDimitry Andric return false;
23125ffd83dbSDimitry Andric }
2313e8d8bef9SDimitry Andric if (ReductionsDisabled) {
2314e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
23155ffd83dbSDimitry Andric return false;
23165ffd83dbSDimitry Andric }
23175ffd83dbSDimitry Andric }
23185ffd83dbSDimitry Andric
23195ffd83dbSDimitry Andric // Next, check that all instructions can be tail-predicated.
2320480093f4SDimitry Andric PredicatedScalarEvolution PSE = LAI->getPSE();
23215ffd83dbSDimitry Andric SmallVector<Instruction *, 16> LoadStores;
2322480093f4SDimitry Andric int ICmpCount = 0;
2323480093f4SDimitry Andric
2324480093f4SDimitry Andric for (BasicBlock *BB : L->blocks()) {
2325480093f4SDimitry Andric for (Instruction &I : BB->instructionsWithoutDebug()) {
2326480093f4SDimitry Andric if (isa<PHINode>(&I))
2327480093f4SDimitry Andric continue;
2328480093f4SDimitry Andric if (!canTailPredicateInstruction(I, ICmpCount)) {
2329480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2330480093f4SDimitry Andric return false;
2331480093f4SDimitry Andric }
2332480093f4SDimitry Andric
2333480093f4SDimitry Andric Type *T = I.getType();
2334480093f4SDimitry Andric if (T->getScalarSizeInBits() > 32) {
2335480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2336480093f4SDimitry Andric return false;
2337480093f4SDimitry Andric }
2338480093f4SDimitry Andric if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2339349cc55cSDimitry Andric Value *Ptr = getLoadStorePointerOperand(&I);
2340349cc55cSDimitry Andric Type *AccessTy = getLoadStoreType(&I);
2341bdd1243dSDimitry Andric int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2342e8d8bef9SDimitry Andric if (NextStride == 1) {
2343480093f4SDimitry Andric // TODO: for now only allow consecutive strides of 1. We could support
2344e8d8bef9SDimitry Andric // other strides as long as it is uniform, but let's keep it simple
2345e8d8bef9SDimitry Andric // for now.
2346e8d8bef9SDimitry Andric continue;
2347e8d8bef9SDimitry Andric } else if (NextStride == -1 ||
2348e8d8bef9SDimitry Andric (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2349e8d8bef9SDimitry Andric (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2350e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs()
2351e8d8bef9SDimitry Andric << "Consecutive strides of 2 found, vld2/vst2 can't "
2352e8d8bef9SDimitry Andric "be tail-predicated.\n");
2353e8d8bef9SDimitry Andric return false;
2354e8d8bef9SDimitry Andric // TODO: don't tail predicate if there is a reversed load?
2355e8d8bef9SDimitry Andric } else if (EnableMaskedGatherScatters) {
2356e8d8bef9SDimitry Andric // Gather/scatters do allow loading from arbitrary strides, at
2357e8d8bef9SDimitry Andric // least if they are loop invariant.
2358e8d8bef9SDimitry Andric // TODO: Loop variant strides should in theory work, too, but
2359e8d8bef9SDimitry Andric // this requires further testing.
2360349cc55cSDimitry Andric const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2361e8d8bef9SDimitry Andric if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2362e8d8bef9SDimitry Andric const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2363e8d8bef9SDimitry Andric if (PSE.getSE()->isLoopInvariant(Step, L))
2364480093f4SDimitry Andric continue;
2365480093f4SDimitry Andric }
2366e8d8bef9SDimitry Andric }
2367e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2368480093f4SDimitry Andric "tail-predicate.\n");
2369480093f4SDimitry Andric return false;
2370480093f4SDimitry Andric }
2371480093f4SDimitry Andric }
2372480093f4SDimitry Andric }
2373480093f4SDimitry Andric
2374480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2375480093f4SDimitry Andric return true;
2376480093f4SDimitry Andric }
2377480093f4SDimitry Andric
237806c3fb27SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
23795ffd83dbSDimitry Andric if (!EnableTailPredication) {
23805ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2381480093f4SDimitry Andric return false;
23825ffd83dbSDimitry Andric }
2383480093f4SDimitry Andric
2384480093f4SDimitry Andric // Creating a predicated vector loop is the first step for generating a
2385480093f4SDimitry Andric // tail-predicated hardware loop, for which we need the MVE masked
2386480093f4SDimitry Andric // load/stores instructions:
2387480093f4SDimitry Andric if (!ST->hasMVEIntegerOps())
2388480093f4SDimitry Andric return false;
2389480093f4SDimitry Andric
239006c3fb27SDimitry Andric LoopVectorizationLegality *LVL = TFI->LVL;
239106c3fb27SDimitry Andric Loop *L = LVL->getLoop();
239206c3fb27SDimitry Andric
2393480093f4SDimitry Andric // For now, restrict this to single block loops.
2394480093f4SDimitry Andric if (L->getNumBlocks() > 1) {
2395480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2396480093f4SDimitry Andric "loop.\n");
2397480093f4SDimitry Andric return false;
2398480093f4SDimitry Andric }
2399480093f4SDimitry Andric
2400e8d8bef9SDimitry Andric assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2401480093f4SDimitry Andric
240206c3fb27SDimitry Andric LoopInfo *LI = LVL->getLoopInfo();
2403480093f4SDimitry Andric HardwareLoopInfo HWLoopInfo(L);
2404480093f4SDimitry Andric if (!HWLoopInfo.canAnalyze(*LI)) {
2405480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2406480093f4SDimitry Andric "analyzable.\n");
2407480093f4SDimitry Andric return false;
2408480093f4SDimitry Andric }
2409480093f4SDimitry Andric
241006c3fb27SDimitry Andric AssumptionCache *AC = LVL->getAssumptionCache();
241106c3fb27SDimitry Andric ScalarEvolution *SE = LVL->getScalarEvolution();
241206c3fb27SDimitry Andric
2413480093f4SDimitry Andric // This checks if we have the low-overhead branch architecture
2414480093f4SDimitry Andric // extension, and if we will create a hardware-loop:
241506c3fb27SDimitry Andric if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2416480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2417480093f4SDimitry Andric "profitable.\n");
2418480093f4SDimitry Andric return false;
2419480093f4SDimitry Andric }
2420480093f4SDimitry Andric
242106c3fb27SDimitry Andric DominatorTree *DT = LVL->getDominatorTree();
242206c3fb27SDimitry Andric if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2423480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2424480093f4SDimitry Andric "a candidate.\n");
2425480093f4SDimitry Andric return false;
2426480093f4SDimitry Andric }
2427480093f4SDimitry Andric
242806c3fb27SDimitry Andric return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2429480093f4SDimitry Andric }
2430480093f4SDimitry Andric
243106c3fb27SDimitry Andric TailFoldingStyle
243206c3fb27SDimitry Andric ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
24335ffd83dbSDimitry Andric if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
243406c3fb27SDimitry Andric return TailFoldingStyle::DataWithoutLaneMask;
2435480093f4SDimitry Andric
24365ffd83dbSDimitry Andric // Intrinsic @llvm.get.active.lane.mask is supported.
24375ffd83dbSDimitry Andric // It is used in the MVETailPredication pass, which requires the number of
24385ffd83dbSDimitry Andric // elements processed by this vector loop to setup the tail-predicated
24395ffd83dbSDimitry Andric // loop.
244006c3fb27SDimitry Andric return TailFoldingStyle::Data;
24415ffd83dbSDimitry Andric }
24420b57cec5SDimitry Andric void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2443349cc55cSDimitry Andric TTI::UnrollingPreferences &UP,
2444349cc55cSDimitry Andric OptimizationRemarkEmitter *ORE) {
24455f757f3fSDimitry Andric // Enable upper-bound unrolling universally, provided that we do not see an
24465f757f3fSDimitry Andric // active lane mask, which will be better kept as a loop to become tail
24475f757f3fSDimitry Andric // predicated than to be conditionally unrolled.
24485f757f3fSDimitry Andric UP.UpperBound =
24495f757f3fSDimitry Andric !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
24505f757f3fSDimitry Andric return isa<IntrinsicInst>(I) &&
24515f757f3fSDimitry Andric cast<IntrinsicInst>(I).getIntrinsicID() ==
24525f757f3fSDimitry Andric Intrinsic::get_active_lane_mask;
24535f757f3fSDimitry Andric });
2454fe6060f1SDimitry Andric
24550b57cec5SDimitry Andric // Only currently enable these preferences for M-Class cores.
24560b57cec5SDimitry Andric if (!ST->isMClass())
2457349cc55cSDimitry Andric return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
24580b57cec5SDimitry Andric
24590b57cec5SDimitry Andric // Disable loop unrolling for Oz and Os.
24600b57cec5SDimitry Andric UP.OptSizeThreshold = 0;
24610b57cec5SDimitry Andric UP.PartialOptSizeThreshold = 0;
24620b57cec5SDimitry Andric if (L->getHeader()->getParent()->hasOptSize())
24630b57cec5SDimitry Andric return;
24640b57cec5SDimitry Andric
24650b57cec5SDimitry Andric SmallVector<BasicBlock*, 4> ExitingBlocks;
24660b57cec5SDimitry Andric L->getExitingBlocks(ExitingBlocks);
24670b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Loop has:\n"
24680b57cec5SDimitry Andric << "Blocks: " << L->getNumBlocks() << "\n"
24690b57cec5SDimitry Andric << "Exit blocks: " << ExitingBlocks.size() << "\n");
24700b57cec5SDimitry Andric
24710b57cec5SDimitry Andric // Only allow another exit other than the latch. This acts as an early exit
24720b57cec5SDimitry Andric // as it mirrors the profitability calculation of the runtime unroller.
24730b57cec5SDimitry Andric if (ExitingBlocks.size() > 2)
24740b57cec5SDimitry Andric return;
24750b57cec5SDimitry Andric
24760b57cec5SDimitry Andric // Limit the CFG of the loop body for targets with a branch predictor.
24770b57cec5SDimitry Andric // Allowing 4 blocks permits if-then-else diamonds in the body.
24780b57cec5SDimitry Andric if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
24790b57cec5SDimitry Andric return;
24800b57cec5SDimitry Andric
2481e8d8bef9SDimitry Andric // Don't unroll vectorized loops, including the remainder loop
2482e8d8bef9SDimitry Andric if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2483e8d8bef9SDimitry Andric return;
2484e8d8bef9SDimitry Andric
24850b57cec5SDimitry Andric // Scan the loop: don't unroll loops with calls as this could prevent
24860b57cec5SDimitry Andric // inlining.
2487fe6060f1SDimitry Andric InstructionCost Cost = 0;
24880b57cec5SDimitry Andric for (auto *BB : L->getBlocks()) {
24890b57cec5SDimitry Andric for (auto &I : *BB) {
2490480093f4SDimitry Andric // Don't unroll vectorized loops. MVE does not benefit from it as much as
2491480093f4SDimitry Andric // scalar code does.
2492480093f4SDimitry Andric if (I.getType()->isVectorTy())
2493480093f4SDimitry Andric return;
2494480093f4SDimitry Andric
24950b57cec5SDimitry Andric if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
24965ffd83dbSDimitry Andric if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
24970b57cec5SDimitry Andric if (!isLoweredToCall(F))
24980b57cec5SDimitry Andric continue;
24990b57cec5SDimitry Andric }
25000b57cec5SDimitry Andric return;
25010b57cec5SDimitry Andric }
25028bcb0991SDimitry Andric
2503e8d8bef9SDimitry Andric SmallVector<const Value*, 4> Operands(I.operand_values());
2504bdd1243dSDimitry Andric Cost += getInstructionCost(&I, Operands,
2505bdd1243dSDimitry Andric TargetTransformInfo::TCK_SizeAndLatency);
25060b57cec5SDimitry Andric }
25070b57cec5SDimitry Andric }
25080b57cec5SDimitry Andric
2509fe6060f1SDimitry Andric // On v6m cores, there are very few registers available. We can easily end up
2510fe6060f1SDimitry Andric // spilling and reloading more registers in an unrolled loop. Look at the
2511fe6060f1SDimitry Andric // number of LCSSA phis as a rough measure of how many registers will need to
2512fe6060f1SDimitry Andric // be live out of the loop, reducing the default unroll count if more than 1
2513fe6060f1SDimitry Andric // value is needed. In the long run, all of this should be being learnt by a
2514fe6060f1SDimitry Andric // machine.
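// As a quick sketch of the arithmetic below: with the default count of 4, two
// non-GEP LCSSA phis give 4 / 2 = 2, and three or more push the count to 1,
// at which point runtime unrolling is skipped entirely.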
2515fe6060f1SDimitry Andric unsigned UnrollCount = 4;
2516fe6060f1SDimitry Andric if (ST->isThumb1Only()) {
2517fe6060f1SDimitry Andric unsigned ExitingValues = 0;
2518fe6060f1SDimitry Andric SmallVector<BasicBlock *, 4> ExitBlocks;
2519fe6060f1SDimitry Andric L->getExitBlocks(ExitBlocks);
2520fe6060f1SDimitry Andric for (auto *Exit : ExitBlocks) {
2521fe6060f1SDimitry Andric // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2522fe6060f1SDimitry Andric // only the last is expected to be needed for address operands.
2523fe6060f1SDimitry Andric unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2524fe6060f1SDimitry Andric return PH.getNumOperands() != 1 ||
2525fe6060f1SDimitry Andric !isa<GetElementPtrInst>(PH.getOperand(0));
2526fe6060f1SDimitry Andric });
2527fe6060f1SDimitry Andric ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2528fe6060f1SDimitry Andric }
2529fe6060f1SDimitry Andric if (ExitingValues)
2530fe6060f1SDimitry Andric UnrollCount /= ExitingValues;
2531fe6060f1SDimitry Andric if (UnrollCount <= 1)
2532fe6060f1SDimitry Andric return;
2533fe6060f1SDimitry Andric }
2534fe6060f1SDimitry Andric
25350b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2536fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
25370b57cec5SDimitry Andric
25380b57cec5SDimitry Andric UP.Partial = true;
25390b57cec5SDimitry Andric UP.Runtime = true;
25400b57cec5SDimitry Andric UP.UnrollRemainder = true;
2541fe6060f1SDimitry Andric UP.DefaultUnrollRuntimeCount = UnrollCount;
25420b57cec5SDimitry Andric UP.UnrollAndJam = true;
25430b57cec5SDimitry Andric UP.UnrollAndJamInnerLoopThreshold = 60;
25440b57cec5SDimitry Andric
25450b57cec5SDimitry Andric // Force unrolling of small loops, as this can be very useful because of the
25460b57cec5SDimitry Andric // branch-taken cost of the backedge.
25470b57cec5SDimitry Andric if (Cost < 12)
25480b57cec5SDimitry Andric UP.Force = true;
25490b57cec5SDimitry Andric }
25508bcb0991SDimitry Andric
25515ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
25525ffd83dbSDimitry Andric TTI::PeelingPreferences &PP) {
25535ffd83dbSDimitry Andric BaseT::getPeelingPreferences(L, SE, PP);
25545ffd83dbSDimitry Andric }
25555ffd83dbSDimitry Andric
2556e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2557e8d8bef9SDimitry Andric TTI::ReductionFlags Flags) const {
2558e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps())
2559e8d8bef9SDimitry Andric return false;
2560e8d8bef9SDimitry Andric
2561e8d8bef9SDimitry Andric unsigned ScalarBits = Ty->getScalarSizeInBits();
2562e8d8bef9SDimitry Andric switch (Opcode) {
2563e8d8bef9SDimitry Andric case Instruction::Add:
2564e8d8bef9SDimitry Andric return ScalarBits <= 64;
2565e8d8bef9SDimitry Andric default:
2566e8d8bef9SDimitry Andric return false;
2567e8d8bef9SDimitry Andric }
2568e8d8bef9SDimitry Andric }
2569e8d8bef9SDimitry Andric
2570e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect(
2571e8d8bef9SDimitry Andric unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2572e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps())
2573e8d8bef9SDimitry Andric return false;
2574e8d8bef9SDimitry Andric return true;
2575e8d8bef9SDimitry Andric }
2576bdd1243dSDimitry Andric
2577bdd1243dSDimitry Andric InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2578*0fca6ea1SDimitry Andric StackOffset BaseOffset,
2579bdd1243dSDimitry Andric bool HasBaseReg, int64_t Scale,
2580bdd1243dSDimitry Andric unsigned AddrSpace) const {
2581bdd1243dSDimitry Andric TargetLoweringBase::AddrMode AM;
2582bdd1243dSDimitry Andric AM.BaseGV = BaseGV;
2583*0fca6ea1SDimitry Andric AM.BaseOffs = BaseOffset.getFixed();
2584bdd1243dSDimitry Andric AM.HasBaseReg = HasBaseReg;
2585bdd1243dSDimitry Andric AM.Scale = Scale;
2586*0fca6ea1SDimitry Andric AM.ScalableOffset = BaseOffset.getScalable();
2587bdd1243dSDimitry Andric if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2588bdd1243dSDimitry Andric if (ST->hasFPAO())
2589bdd1243dSDimitry Andric return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2590bdd1243dSDimitry Andric return 0;
2591bdd1243dSDimitry Andric }
2592bdd1243dSDimitry Andric return -1;
2593bdd1243dSDimitry Andric }
259406c3fb27SDimitry Andric
259506c3fb27SDimitry Andric bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
259606c3fb27SDimitry Andric if (Thumb) {
259706c3fb27SDimitry Andric // B.W is available in any Thumb2-supporting target, and also in every
259806c3fb27SDimitry Andric // version of Armv8-M, even Baseline which does not include the rest of
259906c3fb27SDimitry Andric // Thumb2.
260006c3fb27SDimitry Andric return ST->isThumb2() || ST->hasV8MBaselineOps();
260106c3fb27SDimitry Andric } else {
260206c3fb27SDimitry Andric // B is available in all versions of the Arm ISA, so the only question is
260306c3fb27SDimitry Andric // whether that ISA is available at all.
260406c3fb27SDimitry Andric return ST->hasARMOps();
260506c3fb27SDimitry Andric }
260606c3fb27SDimitry Andric }
2607