//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
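/// For example (illustrative IR), given a suitably aligned pointer:
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
/// becomes a plain "load <4 x i32>, ptr %p, align 4", which later passes can
/// constant-fold when %p addresses a constant.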
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

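// Inlining is compatible only when every subtarget feature outside
// InlineFeaturesAllowed matches exactly between caller and callee, and the
// callee uses no allowed-list feature the caller lacks. For instance, a
// caller and callee built for different instruction-set modes will typically
// not be inlined into each other.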
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

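// Pick the addressing mode that loop optimisations (e.g. LSR) should aim
// for. MVE targets make good use of post-incrementing addressing, while
// small single-block Thumb2 M-class loops tend to profit from pre-indexing.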
TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
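    // i2v(v2i(x) ^ m), where the low 16 bits of m are all ones, inverts every
    // predicate lane, so it can be rewritten as x ^ <all-true splat>.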
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }

    if (II.getMetadata(LLVMContext::MD_range))
      break;

    ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));

    if (auto CurrentRange = II.getRange()) {
      Range = Range.intersectWith(*CurrentRange);
      if (Range == CurrentRange)
        break;
    }

    II.addRangeRetAttr(Range);
    II.addRetAttr(Attribute::NoUndef);
    return &II;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

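    // Only bit 29 of the carry operand is actually consumed (the position of
    // the C flag within FPSCR), so narrow the demanded bits to that one bit.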
    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
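    // Fold an add of this instruction's result into its accumulator operand
    // when the current accumulator is zero:
    //   add(vmldava(0, x, y), z) -> vmldava(z, x, y)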
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
  // the index of the operand that selects the Top/Bottom variant of the
  // instruction, which can vary between intrinsics.
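  // For example, a "top" variant writes the odd lanes of the result, so only
  // the even lanes of operand 0 survive and are demanded of it.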
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd/even lanes of operand 0 will be demanded, depending on
    // whether this is a top or bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

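// Roughly estimate the cost of materialising an integer immediate: 1 when it
// fits a single mov/mvn or modified-immediate encoding, 2 when a movw/movt
// pair is available (v6T2), and 3 when a constant-pool load is needed.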
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // In Thumb1, any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
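// For example, smax(smin(x, 127), -128) clamps x to the signed 8-bit range
// and can be selected as an "ssat #8".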
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1 << 12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1 << 8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
            (EltVT == MVT::f64 && ST->hasFP64()) ||
            (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads / truncating masked stores are expensive because
  // we currently don't split them. This means we'll likely end up
  // loading/storing each element individually (hence the high cost).
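  // Once scalarised, that is roughly two operations per lane, scaled by the
  // MVE vector cost factor below.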
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following entries extend from a legal type to an illegal type,
        // so the load needs splitting. This introduces an extra load
        // operation, but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
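  // For example, a sign_extend whose only user is an add can be folded into
  // vaddl, making the extension itself free.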
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised, so they take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

8285ffd83dbSDimitry Andric   if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
8295ffd83dbSDimitry Andric     // As a general rule, fp converts that were not matched above are scalarized
8305ffd83dbSDimitry Andric     // and cost 1 vcvt for each lane, so long as the instruction is available.
8315ffd83dbSDimitry Andric     // If not, it will become a series of function calls.
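    // For example, under this rule a 4-lane convert between legal fp types is
    // costed at 4 (one vcvt per lane); if either type is not a legal fp type,
    // the cost becomes 4 * CallCost instead.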
832fe6060f1SDimitry Andric     const InstructionCost CallCost =
833fe6060f1SDimitry Andric         getCallInstrCost(nullptr, Dst, {Src}, CostKind);
8345ffd83dbSDimitry Andric     int Lanes = 1;
8355ffd83dbSDimitry Andric     if (SrcTy.isFixedLengthVector())
8365ffd83dbSDimitry Andric       Lanes = SrcTy.getVectorNumElements();
8375ffd83dbSDimitry Andric 
838e8d8bef9SDimitry Andric     if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
8395ffd83dbSDimitry Andric       return Lanes;
8405ffd83dbSDimitry Andric     else
8415ffd83dbSDimitry Andric       return Lanes * CallCost;
8428bcb0991SDimitry Andric   }
8438bcb0991SDimitry Andric 
844e8d8bef9SDimitry Andric   if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
845e8d8bef9SDimitry Andric       SrcTy.isFixedLengthVector()) {
846e8d8bef9SDimitry Andric     // Treat a truncate with a larger than legal source (128 bits for MVE) as
847e8d8bef9SDimitry Andric     // expensive, 2 instructions per lane.
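    // For example, a v8i32->v8i16 truncate has a 256-bit source, so it is
    // costed as 8 lanes * 2 = 16.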
848e8d8bef9SDimitry Andric     if ((SrcTy.getScalarType() == MVT::i8 ||
849e8d8bef9SDimitry Andric          SrcTy.getScalarType() == MVT::i16 ||
850e8d8bef9SDimitry Andric          SrcTy.getScalarType() == MVT::i32) &&
851e8d8bef9SDimitry Andric         SrcTy.getSizeInBits() > 128 &&
852e8d8bef9SDimitry Andric         SrcTy.getSizeInBits() > DstTy.getSizeInBits())
853e8d8bef9SDimitry Andric       return SrcTy.getVectorNumElements() * 2;
854e8d8bef9SDimitry Andric   }
855e8d8bef9SDimitry Andric 
8560b57cec5SDimitry Andric   // Scalar integer conversion costs.
8570b57cec5SDimitry Andric   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
8580b57cec5SDimitry Andric     // i16 -> i64 requires two dependent operations.
8590b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
8600b57cec5SDimitry Andric 
8610b57cec5SDimitry Andric     // Truncates on i64 are assumed to be free.
8620b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
8630b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
8640b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
8650b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
8660b57cec5SDimitry Andric   };
8670b57cec5SDimitry Andric 
8680b57cec5SDimitry Andric   if (SrcTy.isInteger()) {
8690b57cec5SDimitry Andric     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
8700b57cec5SDimitry Andric                                                    DstTy.getSimpleVT(),
8710b57cec5SDimitry Andric                                                    SrcTy.getSimpleVT()))
8725ffd83dbSDimitry Andric       return AdjustCost(Entry->Cost);
8730b57cec5SDimitry Andric   }
8740b57cec5SDimitry Andric 
8758bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
876fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
8778bcb0991SDimitry Andric                      : 1;
8785ffd83dbSDimitry Andric   return AdjustCost(
879e8d8bef9SDimitry Andric       BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
8800b57cec5SDimitry Andric }
8810b57cec5SDimitry Andric 
882fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
883bdd1243dSDimitry Andric                                                TTI::TargetCostKind CostKind,
884bdd1243dSDimitry Andric                                                unsigned Index, Value *Op0,
885bdd1243dSDimitry Andric                                                Value *Op1) {
8860b57cec5SDimitry Andric   // Penalize inserting into a D-subregister. We end up with a three times
8870b57cec5SDimitry Andric   // lower estimated throughput on Swift.
8880b57cec5SDimitry Andric   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
8890b57cec5SDimitry Andric       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
8900b57cec5SDimitry Andric     return 3;
8910b57cec5SDimitry Andric 
8928bcb0991SDimitry Andric   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
8930b57cec5SDimitry Andric                         Opcode == Instruction::ExtractElement)) {
8940b57cec5SDimitry Andric     // Cross-class copies are expensive on many microarchitectures,
8950b57cec5SDimitry Andric     // so assume they are expensive by default.
8965ffd83dbSDimitry Andric     if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
8970b57cec5SDimitry Andric       return 3;
8980b57cec5SDimitry Andric 
8990b57cec5SDimitry Andric     // Even if it's not a cross-class copy, this likely leads to mixing
9000b57cec5SDimitry Andric     // of NEON and VFP code and should therefore be penalized.
9010b57cec5SDimitry Andric     if (ValTy->isVectorTy() &&
9020b57cec5SDimitry Andric         ValTy->getScalarSizeInBits() <= 32)
903fe6060f1SDimitry Andric       return std::max<InstructionCost>(
904bdd1243dSDimitry Andric           BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
905bdd1243dSDimitry Andric           2U);
9060b57cec5SDimitry Andric   }
9070b57cec5SDimitry Andric 
9088bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
9098bcb0991SDimitry Andric                                  Opcode == Instruction::ExtractElement)) {
910fe6060f1SDimitry Andric     // Integer cross-lane moves are more expensive than float, which can
911fe6060f1SDimitry Andric     // sometimes just be vmovs. Integer moves involve passing through GPR
912fe6060f1SDimitry Andric     // registers, causing more of a delay.
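    // For example, extracting an i32 lane from a v4i32 is costed at 4 below
    // (a move out to a GPR), while extracting an f32 lane is costed at 1.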
913fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
914bdd1243dSDimitry Andric         getTypeLegalizationCost(ValTy->getScalarType());
915fe6060f1SDimitry Andric     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
9168bcb0991SDimitry Andric   }
9178bcb0991SDimitry Andric 
918bdd1243dSDimitry Andric   return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
9190b57cec5SDimitry Andric }
9200b57cec5SDimitry Andric 
921fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
922fe6060f1SDimitry Andric                                                Type *CondTy,
923e8d8bef9SDimitry Andric                                                CmpInst::Predicate VecPred,
9245ffd83dbSDimitry Andric                                                TTI::TargetCostKind CostKind,
9250b57cec5SDimitry Andric                                                const Instruction *I) {
9260b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
927e8d8bef9SDimitry Andric 
928e8d8bef9SDimitry Andric   // Thumb scalar code size cost for select.
929e8d8bef9SDimitry Andric   if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
930e8d8bef9SDimitry Andric       ST->isThumb() && !ValTy->isVectorTy()) {
931e8d8bef9SDimitry Andric     // Assume expensive structs.
932e8d8bef9SDimitry Andric     if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
933e8d8bef9SDimitry Andric       return TTI::TCC_Expensive;
934e8d8bef9SDimitry Andric 
935e8d8bef9SDimitry Andric     // Select costs can vary because they:
936e8d8bef9SDimitry Andric     // - may require one or more conditional mov (including an IT),
937e8d8bef9SDimitry Andric     // - can't operate directly on immediates,
938e8d8bef9SDimitry Andric     // - require live flags, which we can't copy around easily.
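    // For example, a select on a legal scalar type is costed at 2 below (a
    // type legalization cost of 1 plus the possible IT); an i1 select is 3.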
939bdd1243dSDimitry Andric     InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
940e8d8bef9SDimitry Andric 
941e8d8bef9SDimitry Andric     // Possible IT instruction for Thumb2, or more for Thumb1.
942e8d8bef9SDimitry Andric     ++Cost;
943e8d8bef9SDimitry Andric 
944e8d8bef9SDimitry Andric     // i1 values may need rematerialising by using mov immediates and/or
945e8d8bef9SDimitry Andric     // flag setting instructions.
946e8d8bef9SDimitry Andric     if (ValTy->isIntegerTy(1))
947e8d8bef9SDimitry Andric       ++Cost;
948e8d8bef9SDimitry Andric 
949e8d8bef9SDimitry Andric     return Cost;
950e8d8bef9SDimitry Andric   }
951e8d8bef9SDimitry Andric 
952fe6060f1SDimitry Andric   // If this is a vector min/max/abs, use the cost of that intrinsic directly
953fe6060f1SDimitry Andric   // instead. Hopefully when min/max intrinsics are more prevalent this code
954fe6060f1SDimitry Andric   // will not be needed.
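  // For example, an icmp slt feeding a one-use select that together form a
  // signed-min pattern is costed below as a single smin intrinsic, with the
  // icmp itself reported as free (cost 0).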
955fe6060f1SDimitry Andric   const Instruction *Sel = I;
956fe6060f1SDimitry Andric   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
957fe6060f1SDimitry Andric       Sel->hasOneUse())
958fe6060f1SDimitry Andric     Sel = cast<Instruction>(Sel->user_back());
959fe6060f1SDimitry Andric   if (Sel && ValTy->isVectorTy() &&
960fe6060f1SDimitry Andric       (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
961fe6060f1SDimitry Andric     const Value *LHS, *RHS;
962fe6060f1SDimitry Andric     SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
963fe6060f1SDimitry Andric     unsigned IID = 0;
964fe6060f1SDimitry Andric     switch (SPF) {
965fe6060f1SDimitry Andric     case SPF_ABS:
966fe6060f1SDimitry Andric       IID = Intrinsic::abs;
967fe6060f1SDimitry Andric       break;
968fe6060f1SDimitry Andric     case SPF_SMIN:
969fe6060f1SDimitry Andric       IID = Intrinsic::smin;
970fe6060f1SDimitry Andric       break;
971fe6060f1SDimitry Andric     case SPF_SMAX:
972fe6060f1SDimitry Andric       IID = Intrinsic::smax;
973fe6060f1SDimitry Andric       break;
974fe6060f1SDimitry Andric     case SPF_UMIN:
975fe6060f1SDimitry Andric       IID = Intrinsic::umin;
976fe6060f1SDimitry Andric       break;
977fe6060f1SDimitry Andric     case SPF_UMAX:
978fe6060f1SDimitry Andric       IID = Intrinsic::umax;
979fe6060f1SDimitry Andric       break;
980fe6060f1SDimitry Andric     case SPF_FMINNUM:
981fe6060f1SDimitry Andric       IID = Intrinsic::minnum;
982fe6060f1SDimitry Andric       break;
983fe6060f1SDimitry Andric     case SPF_FMAXNUM:
984fe6060f1SDimitry Andric       IID = Intrinsic::maxnum;
985fe6060f1SDimitry Andric       break;
986fe6060f1SDimitry Andric     default:
987fe6060f1SDimitry Andric       break;
988fe6060f1SDimitry Andric     }
989fe6060f1SDimitry Andric     if (IID) {
990fe6060f1SDimitry Andric       // The ICmp is free; the select gets the cost of the min/max/etc.
991fe6060f1SDimitry Andric       if (Sel != I)
992fe6060f1SDimitry Andric         return 0;
993fe6060f1SDimitry Andric       IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
994fe6060f1SDimitry Andric       return getIntrinsicInstrCost(CostAttrs, CostKind);
995fe6060f1SDimitry Andric     }
996fe6060f1SDimitry Andric   }
997fe6060f1SDimitry Andric 
9980b57cec5SDimitry Andric   // On NEON a vector select gets lowered to vbsl.
999e8d8bef9SDimitry Andric   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
10000b57cec5SDimitry Andric     // Lowering of some vector selects is currently far from perfect.
10010b57cec5SDimitry Andric     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
10020b57cec5SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
10030b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
10040b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
10050b57cec5SDimitry Andric     };
10060b57cec5SDimitry Andric 
10070b57cec5SDimitry Andric     EVT SelCondTy = TLI->getValueType(DL, CondTy);
10080b57cec5SDimitry Andric     EVT SelValTy = TLI->getValueType(DL, ValTy);
10090b57cec5SDimitry Andric     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
10100b57cec5SDimitry Andric       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
10110b57cec5SDimitry Andric                                                      SelCondTy.getSimpleVT(),
10120b57cec5SDimitry Andric                                                      SelValTy.getSimpleVT()))
10130b57cec5SDimitry Andric         return Entry->Cost;
10140b57cec5SDimitry Andric     }
10150b57cec5SDimitry Andric 
1016bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
10170b57cec5SDimitry Andric     return LT.first;
10180b57cec5SDimitry Andric   }
10190b57cec5SDimitry Andric 
1020fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1021fe6060f1SDimitry Andric       (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1022fe6060f1SDimitry Andric       cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1023fe6060f1SDimitry Andric     FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1024fe6060f1SDimitry Andric     FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1025fe6060f1SDimitry Andric     if (!VecCondTy)
1026fe6060f1SDimitry Andric       VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1027fe6060f1SDimitry Andric 
1028fe6060f1SDimitry Andric     // If we don't have mve.fp, any fp operations will need to be scalarized.
1029fe6060f1SDimitry Andric     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1030fe6060f1SDimitry Andric       // One scalarization insert, one scalarization extract and the cost of
1031fe6060f1SDimitry Andric       // the fcmps.
1032bdd1243dSDimitry Andric       return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1033bdd1243dSDimitry Andric                                              /*Extract*/ true, CostKind) +
1034bdd1243dSDimitry Andric              BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1035bdd1243dSDimitry Andric                                              /*Extract*/ false, CostKind) +
1036fe6060f1SDimitry Andric              VecValTy->getNumElements() *
1037fe6060f1SDimitry Andric                  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1038bdd1243dSDimitry Andric                                     VecCondTy->getScalarType(), VecPred,
1039bdd1243dSDimitry Andric                                     CostKind, I);
1040fe6060f1SDimitry Andric     }
1041fe6060f1SDimitry Andric 
1042bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1043fe6060f1SDimitry Andric     int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1044fe6060f1SDimitry Andric     // There are two types - the input that specifies the type of the compare
1045fe6060f1SDimitry Andric     // and the output vXi1 type. Because we don't know how the output will be
1046fe6060f1SDimitry Andric     // split, we may need an expensive shuffle to get the two in sync. This has
1047fe6060f1SDimitry Andric     // the effect of making larger than legal compares (v8i32 for example)
1048fe6060f1SDimitry Andric     // expensive.
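    // For example, a v8i32 icmp is split into two legal v4i32 compares
    // (LT.first == 2), so it is costed at 2 * BaseCost plus the overhead of
    // inserting the results into the v8i1 condition vector.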
1049f3fd488fSDimitry Andric     if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1050fe6060f1SDimitry Andric       if (LT.first > 1)
1051fe6060f1SDimitry Andric         return LT.first * BaseCost +
1052bdd1243dSDimitry Andric                BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1053bdd1243dSDimitry Andric                                                /*Extract*/ false, CostKind);
1054fe6060f1SDimitry Andric       return BaseCost;
1055fe6060f1SDimitry Andric     }
1056fe6060f1SDimitry Andric   }
1057fe6060f1SDimitry Andric 
1058e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1059e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1060e8d8bef9SDimitry Andric   int BaseCost = 1;
1061fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1062fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063e8d8bef9SDimitry Andric 
1064e8d8bef9SDimitry Andric   return BaseCost *
1065e8d8bef9SDimitry Andric          BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
10660b57cec5SDimitry Andric }
10670b57cec5SDimitry Andric 
1068fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1069fe6060f1SDimitry Andric                                                       ScalarEvolution *SE,
10700b57cec5SDimitry Andric                                                       const SCEV *Ptr) {
10710b57cec5SDimitry Andric   // Address computations in vectorized code with non-consecutive addresses will
10720b57cec5SDimitry Andric   // likely result in more instructions compared to scalar code where the
10730b57cec5SDimitry Andric   // computation can more often be merged into the index mode. The resulting
10740b57cec5SDimitry Andric   // extra micro-ops can significantly decrease throughput.
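  // For example, on NEON a vector access whose stride is not a provably
  // small constant (at most MaxMergeDistance bytes) is costed at
  // NumVectorInstToHideOverhead (10) below.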
10750b57cec5SDimitry Andric   unsigned NumVectorInstToHideOverhead = 10;
10760b57cec5SDimitry Andric   int MaxMergeDistance = 64;
10770b57cec5SDimitry Andric 
10788bcb0991SDimitry Andric   if (ST->hasNEON()) {
10790b57cec5SDimitry Andric     if (Ty->isVectorTy() && SE &&
10800b57cec5SDimitry Andric         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
10810b57cec5SDimitry Andric       return NumVectorInstToHideOverhead;
10820b57cec5SDimitry Andric 
10830b57cec5SDimitry Andric     // In many cases the address computation is not merged into the instruction
10840b57cec5SDimitry Andric     // addressing mode.
10850b57cec5SDimitry Andric     return 1;
10860b57cec5SDimitry Andric   }
10878bcb0991SDimitry Andric   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
10888bcb0991SDimitry Andric }
10898bcb0991SDimitry Andric 
10905ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
10915ffd83dbSDimitry Andric   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
10925ffd83dbSDimitry Andric     // If a VCTP is part of a chain, it's already profitable and shouldn't be
10935ffd83dbSDimitry Andric     // optimized, else LSR may block tail-predication.
10945ffd83dbSDimitry Andric     switch (II->getIntrinsicID()) {
10955ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp8:
10965ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp16:
10975ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp32:
10985ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp64:
10995ffd83dbSDimitry Andric       return true;
11005ffd83dbSDimitry Andric     default:
11015ffd83dbSDimitry Andric       break;
11025ffd83dbSDimitry Andric     }
11035ffd83dbSDimitry Andric   }
11045ffd83dbSDimitry Andric   return false;
11055ffd83dbSDimitry Andric }
11065ffd83dbSDimitry Andric 
11075ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
11088bcb0991SDimitry Andric   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
11098bcb0991SDimitry Andric     return false;
11108bcb0991SDimitry Andric 
11115ffd83dbSDimitry Andric   if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
11128bcb0991SDimitry Andric     // Don't support v2i1 yet.
11138bcb0991SDimitry Andric     if (VecTy->getNumElements() == 2)
11148bcb0991SDimitry Andric       return false;
11158bcb0991SDimitry Andric 
11168bcb0991SDimitry Andric     // We don't support extending fp types.
11178bcb0991SDimitry Andric     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
11188bcb0991SDimitry Andric     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
11198bcb0991SDimitry Andric       return false;
11208bcb0991SDimitry Andric   }
11218bcb0991SDimitry Andric 
11228bcb0991SDimitry Andric   unsigned EltWidth = DataTy->getScalarSizeInBits();
11235ffd83dbSDimitry Andric   return (EltWidth == 32 && Alignment >= 4) ||
11245ffd83dbSDimitry Andric          (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
11258bcb0991SDimitry Andric }
11260b57cec5SDimitry Andric 
11275ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1128480093f4SDimitry Andric   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1129480093f4SDimitry Andric     return false;
1130480093f4SDimitry Andric 
1131480093f4SDimitry Andric   unsigned EltWidth = Ty->getScalarSizeInBits();
11325ffd83dbSDimitry Andric   return ((EltWidth == 32 && Alignment >= 4) ||
11335ffd83dbSDimitry Andric           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1134480093f4SDimitry Andric }
1135480093f4SDimitry Andric 
1136e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory
1137e8d8bef9SDimitry Andric /// operations performed, determined by querying findOptimalMemOpLowering.
1138e8d8bef9SDimitry Andric /// Returns -1 if a library call is used.
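/// For example, a 16-byte memcpy that findOptimalMemOpLowering breaks into
/// four i32 chunks is reported as 4 * 2 = 8 operations (the factor of 2
/// counts the load and the store of each chunk).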
1139e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1140e8d8bef9SDimitry Andric   MemOp MOp;
1141e8d8bef9SDimitry Andric   unsigned DstAddrSpace = ~0u;
1142e8d8bef9SDimitry Andric   unsigned SrcAddrSpace = ~0u;
1143e8d8bef9SDimitry Andric   const Function *F = I->getParent()->getParent();
11440b57cec5SDimitry Andric 
1145e8d8bef9SDimitry Andric   if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1146e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
11470b57cec5SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
11480b57cec5SDimitry Andric     if (!C)
1149e8d8bef9SDimitry Andric       return -1;
11500b57cec5SDimitry Andric 
11510b57cec5SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1152e8d8bef9SDimitry Andric     const Align DstAlign = *MC->getDestAlign();
1153e8d8bef9SDimitry Andric     const Align SrcAlign = *MC->getSourceAlign();
1154e8d8bef9SDimitry Andric 
1155e8d8bef9SDimitry Andric     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1156e8d8bef9SDimitry Andric                       /*IsVolatile*/ false);
1157e8d8bef9SDimitry Andric     DstAddrSpace = MC->getDestAddressSpace();
1158e8d8bef9SDimitry Andric     SrcAddrSpace = MC->getSourceAddressSpace();
1159e8d8bef9SDimitry Andric   }
1160e8d8bef9SDimitry Andric   else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1161e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1162e8d8bef9SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
1163e8d8bef9SDimitry Andric     if (!C)
1164e8d8bef9SDimitry Andric       return -1;
1165e8d8bef9SDimitry Andric 
1166e8d8bef9SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1167e8d8bef9SDimitry Andric     const Align DstAlign = *MS->getDestAlign();
1168e8d8bef9SDimitry Andric 
1169e8d8bef9SDimitry Andric     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1170e8d8bef9SDimitry Andric                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1171e8d8bef9SDimitry Andric     DstAddrSpace = MS->getDestAddressSpace();
1172e8d8bef9SDimitry Andric   }
1173e8d8bef9SDimitry Andric   else
1174e8d8bef9SDimitry Andric     llvm_unreachable("Expected a memcpy/move or memset!");
1175e8d8bef9SDimitry Andric 
1176e8d8bef9SDimitry Andric   unsigned Limit, Factor = 2;
1177e8d8bef9SDimitry Andric   switch(I->getIntrinsicID()) {
1178e8d8bef9SDimitry Andric     case Intrinsic::memcpy:
1179e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1180e8d8bef9SDimitry Andric       break;
1181e8d8bef9SDimitry Andric     case Intrinsic::memmove:
1182e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1183e8d8bef9SDimitry Andric       break;
1184e8d8bef9SDimitry Andric     case Intrinsic::memset:
1185e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1186e8d8bef9SDimitry Andric       Factor = 1;
1187e8d8bef9SDimitry Andric       break;
1188e8d8bef9SDimitry Andric     default:
1189e8d8bef9SDimitry Andric       llvm_unreachable("Expected a memcpy/move or memset!");
1190e8d8bef9SDimitry Andric   }
11910b57cec5SDimitry Andric 
11920b57cec5SDimitry Andric   // MemOps will be populated with a list of data types that need to be
11930b57cec5SDimitry Andric   // loaded and stored. That's why we multiply the number of elements by 2 to
11940b57cec5SDimitry Andric   // get the cost for this memcpy.
1195e8d8bef9SDimitry Andric   std::vector<EVT> MemOps;
11960b57cec5SDimitry Andric   if (getTLI()->findOptimalMemOpLowering(
1197e8d8bef9SDimitry Andric           MemOps, Limit, MOp, DstAddrSpace,
1198e8d8bef9SDimitry Andric           SrcAddrSpace, F->getAttributes()))
1199e8d8bef9SDimitry Andric     return MemOps.size() * Factor;
12000b57cec5SDimitry Andric 
12010b57cec5SDimitry Andric   // If we can't find an optimal memop lowering, return the default cost.
1202e8d8bef9SDimitry Andric   return -1;
1203e8d8bef9SDimitry Andric }
1204e8d8bef9SDimitry Andric 
1205fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1206e8d8bef9SDimitry Andric   int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1207e8d8bef9SDimitry Andric 
1208e8d8bef9SDimitry Andric   // To model the cost of a library call, we assume 1 for the call, and
1209e8d8bef9SDimitry Andric   // 3 for the argument setup.
1210e8d8bef9SDimitry Andric   if (NumOps == -1)
1211e8d8bef9SDimitry Andric     return 4;
1212e8d8bef9SDimitry Andric   return NumOps;
12130b57cec5SDimitry Andric }
12140b57cec5SDimitry Andric 
1215fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1216fe6060f1SDimitry Andric                                            VectorType *Tp, ArrayRef<int> Mask,
1217bdd1243dSDimitry Andric                                            TTI::TargetCostKind CostKind,
121881ad6265SDimitry Andric                                            int Index, VectorType *SubTp,
1219*0fca6ea1SDimitry Andric                                            ArrayRef<const Value *> Args,
1220*0fca6ea1SDimitry Andric                                            const Instruction *CxtI) {
12215f757f3fSDimitry Andric   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1222*0fca6ea1SDimitry Andric   // Treat extractsubvector as single op permutation.
1223*0fca6ea1SDimitry Andric   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1224*0fca6ea1SDimitry Andric   if (IsExtractSubvector)
1225*0fca6ea1SDimitry Andric     Kind = TTI::SK_PermuteSingleSrc;
12268bcb0991SDimitry Andric   if (ST->hasNEON()) {
12270b57cec5SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12280b57cec5SDimitry Andric       static const CostTblEntry NEONDupTbl[] = {
12290b57cec5SDimitry Andric           // VDUP handles these cases.
12300b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12310b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12320b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12330b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12340b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12350b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12360b57cec5SDimitry Andric 
12370b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12380b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12390b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12400b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
12410b57cec5SDimitry Andric 
1242bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12438bcb0991SDimitry Andric       if (const auto *Entry =
12448bcb0991SDimitry Andric               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
12450b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12460b57cec5SDimitry Andric     }
12470b57cec5SDimitry Andric     if (Kind == TTI::SK_Reverse) {
12480b57cec5SDimitry Andric       static const CostTblEntry NEONShuffleTbl[] = {
12490b57cec5SDimitry Andric           // A reverse shuffle costs one instruction if we are shuffling within a
12500b57cec5SDimitry Andric           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
12510b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12520b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12530b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12540b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12550b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12560b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12570b57cec5SDimitry Andric 
12580b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12590b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12600b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
12610b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
12620b57cec5SDimitry Andric 
1263bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12648bcb0991SDimitry Andric       if (const auto *Entry =
12658bcb0991SDimitry Andric               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
12660b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12670b57cec5SDimitry Andric     }
12680b57cec5SDimitry Andric     if (Kind == TTI::SK_Select) {
12690b57cec5SDimitry Andric       static const CostTblEntry NEONSelShuffleTbl[] = {
12708bcb0991SDimitry Andric           // Select shuffle cost table for ARM. Cost is the number of
12718bcb0991SDimitry Andric           // instructions required to create the shuffled vector.
12730b57cec5SDimitry Andric 
12740b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12750b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12760b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12770b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12780b57cec5SDimitry Andric 
12790b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12800b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12810b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
12820b57cec5SDimitry Andric 
12830b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
12840b57cec5SDimitry Andric 
12850b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
12860b57cec5SDimitry Andric 
1287bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12880b57cec5SDimitry Andric       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
12890b57cec5SDimitry Andric                                               ISD::VECTOR_SHUFFLE, LT.second))
12900b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12910b57cec5SDimitry Andric     }
12928bcb0991SDimitry Andric   }
12938bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps()) {
12948bcb0991SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12958bcb0991SDimitry Andric       static const CostTblEntry MVEDupTbl[] = {
12968bcb0991SDimitry Andric           // VDUP handles these cases.
12978bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12988bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12998bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
13008bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
13018bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
13028bcb0991SDimitry Andric 
1303bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
13048bcb0991SDimitry Andric       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
13058bcb0991SDimitry Andric                                               LT.second))
1306fe6060f1SDimitry Andric         return LT.first * Entry->Cost *
1307fe6060f1SDimitry Andric                ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
13080b57cec5SDimitry Andric     }
13090b57cec5SDimitry Andric 
1310fe6060f1SDimitry Andric     if (!Mask.empty()) {
1311bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
131256f451bbSDimitry Andric       if (LT.second.isVector() &&
131356f451bbSDimitry Andric           Mask.size() <= LT.second.getVectorNumElements() &&
1314fe6060f1SDimitry Andric           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1315fe6060f1SDimitry Andric            isVREVMask(Mask, LT.second, 64)))
1316fe6060f1SDimitry Andric         return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1317fe6060f1SDimitry Andric     }
1318fe6060f1SDimitry Andric   }
1319fe6060f1SDimitry Andric 
1320*0fca6ea1SDimitry Andric   // Restore optimal kind.
1321*0fca6ea1SDimitry Andric   if (IsExtractSubvector)
1322*0fca6ea1SDimitry Andric     Kind = TTI::SK_ExtractSubvector;
1323fe6060f1SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1324fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1325fe6060f1SDimitry Andric                      : 1;
1326bdd1243dSDimitry Andric   return BaseCost *
1327bdd1243dSDimitry Andric          BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1328fe6060f1SDimitry Andric }
1329fe6060f1SDimitry Andric 
1330fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1331fe6060f1SDimitry Andric     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1332bdd1243dSDimitry Andric     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1333bdd1243dSDimitry Andric     ArrayRef<const Value *> Args,
1334480093f4SDimitry Andric     const Instruction *CxtI) {
13350b57cec5SDimitry Andric   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1336e8d8bef9SDimitry Andric   if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1337e8d8bef9SDimitry Andric     // Make operations on i1 relatively expensive as this often involves
1338e8d8bef9SDimitry Andric     // combining predicates. AND and XOR should be easier to handle with IT
1339e8d8bef9SDimitry Andric     // blocks.
1340e8d8bef9SDimitry Andric     switch (ISDOpcode) {
1341e8d8bef9SDimitry Andric     default:
1342e8d8bef9SDimitry Andric       break;
1343e8d8bef9SDimitry Andric     case ISD::AND:
1344e8d8bef9SDimitry Andric     case ISD::XOR:
1345e8d8bef9SDimitry Andric       return 2;
1346e8d8bef9SDimitry Andric     case ISD::OR:
1347e8d8bef9SDimitry Andric       return 3;
1348e8d8bef9SDimitry Andric     }
1349e8d8bef9SDimitry Andric   }
1350e8d8bef9SDimitry Andric 
1351bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
13520b57cec5SDimitry Andric 
1353480093f4SDimitry Andric   if (ST->hasNEON()) {
13540b57cec5SDimitry Andric     const unsigned FunctionCallDivCost = 20;
13550b57cec5SDimitry Andric     const unsigned ReciprocalDivCost = 10;
13560b57cec5SDimitry Andric     static const CostTblEntry CostTbl[] = {
13570b57cec5SDimitry Andric       // Division.
13580b57cec5SDimitry Andric       // These costs are somewhat random. Choose a cost of 20 to indicate that
13590b57cec5SDimitry Andric       // vectorizing division (added function call) is going to be very expensive.
13600b57cec5SDimitry Andric       // Double registers types.
13610b57cec5SDimitry Andric       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13620b57cec5SDimitry Andric       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13630b57cec5SDimitry Andric       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
13640b57cec5SDimitry Andric       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
13650b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13660b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13670b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
13680b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
13690b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
13700b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
13710b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
13720b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
13730b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
13740b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
13750b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
13760b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
13770b57cec5SDimitry Andric       // Quad register types.
13780b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13790b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13800b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
13810b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
13820b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13830b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13840b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
13850b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
13860b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13870b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13880b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
13890b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
13900b57cec5SDimitry Andric       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13910b57cec5SDimitry Andric       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13920b57cec5SDimitry Andric       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
13930b57cec5SDimitry Andric       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
13940b57cec5SDimitry Andric       // Multiplication.
13950b57cec5SDimitry Andric     };
13960b57cec5SDimitry Andric 
13970b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
13980b57cec5SDimitry Andric       return LT.first * Entry->Cost;
13990b57cec5SDimitry Andric 
1400fe6060f1SDimitry Andric     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1401bdd1243dSDimitry Andric         Opcode, Ty, CostKind, Op1Info, Op2Info);
14020b57cec5SDimitry Andric 
14030b57cec5SDimitry Andric     // This is somewhat of a hack. The problem that we are facing is that SROA
14040b57cec5SDimitry Andric     // creates sequences of shift/and/or instructions to construct values.
14050b57cec5SDimitry Andric     // These sequences are recognized by the ISel and have zero-cost. Not so for
14060b57cec5SDimitry Andric     // the vectorized code. Because we have support for v2i64 but not i64 those
14070b57cec5SDimitry Andric     // sequences look particularly beneficial to vectorize.
14080b57cec5SDimitry Andric     // To work around this we increase the cost of v2i64 operations to make them
14090b57cec5SDimitry Andric     // seem less beneficial.
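    // For example, a v2i64 operation whose second operand is a uniform
    // constant (such as the shifts SROA creates) gets a +4 cost adjustment
    // below, discouraging vectorization of these patterns.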
1410bdd1243dSDimitry Andric     if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
14110b57cec5SDimitry Andric       Cost += 4;
14120b57cec5SDimitry Andric 
14130b57cec5SDimitry Andric     return Cost;
14140b57cec5SDimitry Andric   }
14150b57cec5SDimitry Andric 
1416480093f4SDimitry Andric   // If this operation is a shift on arm/thumb2, it might well be folded into
1417480093f4SDimitry Andric   // the following instruction, hence having a cost of 0.
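  // For example, a shl whose only use is an add can be folded into the add's
  // flexible second operand (e.g. add r0, r1, r2, lsl #2), making the shift
  // itself free.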
1418480093f4SDimitry Andric   auto LooksLikeAFreeShift = [&]() {
1419480093f4SDimitry Andric     if (ST->isThumb1Only() || Ty->isVectorTy())
1420480093f4SDimitry Andric       return false;
1421480093f4SDimitry Andric 
1422480093f4SDimitry Andric     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1423480093f4SDimitry Andric       return false;
1424bdd1243dSDimitry Andric     if (!Op2Info.isUniform() || !Op2Info.isConstant())
1425480093f4SDimitry Andric       return false;
1426480093f4SDimitry Andric 
1427480093f4SDimitry Andric     // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1428480093f4SDimitry Andric     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1429480093f4SDimitry Andric     case Instruction::Add:
1430480093f4SDimitry Andric     case Instruction::Sub:
1431480093f4SDimitry Andric     case Instruction::And:
1432480093f4SDimitry Andric     case Instruction::Xor:
1433480093f4SDimitry Andric     case Instruction::Or:
1434480093f4SDimitry Andric     case Instruction::ICmp:
1435480093f4SDimitry Andric       return true;
1436480093f4SDimitry Andric     default:
1437480093f4SDimitry Andric       return false;
1438480093f4SDimitry Andric     }
1439480093f4SDimitry Andric   };
1440480093f4SDimitry Andric   if (LooksLikeAFreeShift())
1441480093f4SDimitry Andric     return 0;
1442480093f4SDimitry Andric 
1443e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1444e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1445e8d8bef9SDimitry Andric   int BaseCost = 1;
1446fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1447fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
14488bcb0991SDimitry Andric 
14498bcb0991SDimitry Andric   // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
14508bcb0991SDimitry Andric   // without treating floats as more expensive than scalars or increasing the
14518bcb0991SDimitry Andric   // costs for custom operations. The result is also multiplied by the
14528bcb0991SDimitry Andric   // MVEVectorCostFactor where appropriate.
14538bcb0991SDimitry Andric   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
14548bcb0991SDimitry Andric     return LT.first * BaseCost;
14558bcb0991SDimitry Andric 
14568bcb0991SDimitry Andric   // Else this is expand, assume that we need to scalarize this op.
14575ffd83dbSDimitry Andric   if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
14585ffd83dbSDimitry Andric     unsigned Num = VTy->getNumElements();
1459fe6060f1SDimitry Andric     InstructionCost Cost =
1460fe6060f1SDimitry Andric         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
14618bcb0991SDimitry Andric     // Return the cost of multiple scalar invocation plus the cost of
14628bcb0991SDimitry Andric     // inserting and extracting the values.
1463fe6060f1SDimitry Andric     SmallVector<Type *> Tys(Args.size(), Ty);
1464bdd1243dSDimitry Andric     return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1465bdd1243dSDimitry Andric            Num * Cost;
14668bcb0991SDimitry Andric   }
14678bcb0991SDimitry Andric 
14688bcb0991SDimitry Andric   return BaseCost;
14698bcb0991SDimitry Andric }
14708bcb0991SDimitry Andric 
1471fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1472fe6060f1SDimitry Andric                                             MaybeAlign Alignment,
1473fe6060f1SDimitry Andric                                             unsigned AddressSpace,
14745ffd83dbSDimitry Andric                                             TTI::TargetCostKind CostKind,
1475bdd1243dSDimitry Andric                                             TTI::OperandValueInfo OpInfo,
1476480093f4SDimitry Andric                                             const Instruction *I) {
14775ffd83dbSDimitry Andric   // TODO: Handle other cost kinds.
14785ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
14795ffd83dbSDimitry Andric     return 1;
14805ffd83dbSDimitry Andric 
14815ffd83dbSDimitry Andric   // Type legalization can't handle structs
14825ffd83dbSDimitry Andric   if (TLI->getValueType(DL, Src, true) == MVT::Other)
14835ffd83dbSDimitry Andric     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14845ffd83dbSDimitry Andric                                   CostKind);
14850b57cec5SDimitry Andric 
1486480093f4SDimitry Andric   if (ST->hasNEON() && Src->isVectorTy() &&
1487480093f4SDimitry Andric       (Alignment && *Alignment != Align(16)) &&
14885ffd83dbSDimitry Andric       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
14890b57cec5SDimitry Andric     // Unaligned loads/stores are extremely inefficient.
14900b57cec5SDimitry Andric     // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1491bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
14920b57cec5SDimitry Andric     return LT.first * 4;
14930b57cec5SDimitry Andric   }
14945ffd83dbSDimitry Andric 
14955ffd83dbSDimitry Andric   // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
14965ffd83dbSDimitry Andric   // Same for stores.
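  // For example, a load of <4 x half> whose only use is an fpext to
  // <4 x float> is costed below as a single MVE operation, since the
  // extension folds into a widening load.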
14975ffd83dbSDimitry Andric   if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
14985ffd83dbSDimitry Andric       ((Opcode == Instruction::Load && I->hasOneUse() &&
14995ffd83dbSDimitry Andric         isa<FPExtInst>(*I->user_begin())) ||
15005ffd83dbSDimitry Andric        (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
15015ffd83dbSDimitry Andric     FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
15025ffd83dbSDimitry Andric     Type *DstTy =
15035ffd83dbSDimitry Andric         Opcode == Instruction::Load
15045ffd83dbSDimitry Andric             ? (*I->user_begin())->getType()
15055ffd83dbSDimitry Andric             : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
15065ffd83dbSDimitry Andric     if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
15075ffd83dbSDimitry Andric         DstTy->getScalarType()->isFloatTy())
1508fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
15095ffd83dbSDimitry Andric   }
15105ffd83dbSDimitry Andric 
15118bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1512fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
15138bcb0991SDimitry Andric                      : 1;
15145ffd83dbSDimitry Andric   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1515bdd1243dSDimitry Andric                                            CostKind, OpInfo, I);
15160b57cec5SDimitry Andric }
15170b57cec5SDimitry Andric 
1518fe6060f1SDimitry Andric InstructionCost
1519fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1520e8d8bef9SDimitry Andric                                   unsigned AddressSpace,
1521e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1522e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps()) {
1523e8d8bef9SDimitry Andric     if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1524fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1525e8d8bef9SDimitry Andric     if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1526fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1527e8d8bef9SDimitry Andric   }
1528e8d8bef9SDimitry Andric   if (!isa<FixedVectorType>(Src))
1529e8d8bef9SDimitry Andric     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1530e8d8bef9SDimitry Andric                                         CostKind);
1531e8d8bef9SDimitry Andric   // Scalar cost, which is currently very high due to the inefficiency of the
1532e8d8bef9SDimitry Andric   // generated code.
1533e8d8bef9SDimitry Andric   return cast<FixedVectorType>(Src)->getNumElements() * 8;
1534e8d8bef9SDimitry Andric }
1535e8d8bef9SDimitry Andric 
1536fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1537480093f4SDimitry Andric     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
15385ffd83dbSDimitry Andric     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
15395ffd83dbSDimitry Andric     bool UseMaskForCond, bool UseMaskForGaps) {
15400b57cec5SDimitry Andric   assert(Factor >= 2 && "Invalid interleave factor");
15410b57cec5SDimitry Andric   assert(isa<VectorType>(VecTy) && "Expect a vector type");
15420b57cec5SDimitry Andric 
15430b57cec5SDimitry Andric   // vldN/vstN don't support vector types with i64/f64 elements.
15440b57cec5SDimitry Andric   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
15450b57cec5SDimitry Andric 
15460b57cec5SDimitry Andric   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
15470b57cec5SDimitry Andric       !UseMaskForCond && !UseMaskForGaps) {
15485ffd83dbSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
15495ffd83dbSDimitry Andric     auto *SubVecTy =
15505ffd83dbSDimitry Andric         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
15510b57cec5SDimitry Andric 
15520b57cec5SDimitry Andric     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
15530b57cec5SDimitry Andric     // Accesses having vector types that are a multiple of 128 bits can be
15540b57cec5SDimitry Andric     // matched to more than one vldN/vstN instruction.
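    // For example, a factor-4 interleaved access over 16 x i32 uses a 4 x i32
    // sub-vector type; when that 128-bit type is legal it maps to a single
    // vld4/vst4, giving a cost of 4 * BaseCost * 1 below.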
1555fe6060f1SDimitry Andric     int BaseCost =
1556fe6060f1SDimitry Andric         ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
15570b57cec5SDimitry Andric     if (NumElts % Factor == 0 &&
1558fe6060f1SDimitry Andric         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1559480093f4SDimitry Andric       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1560480093f4SDimitry Andric 
1561480093f4SDimitry Andric     // Some smaller than legal interleaved patterns are cheap as we can make
1562480093f4SDimitry Andric     // use of the vmovn or vrev patterns to interleave a standard load. This is
1563480093f4SDimitry Andric     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1564480093f4SDimitry Andric     // promoted differently). The cost of 2 here is then a load and vrev or
1565480093f4SDimitry Andric     // vmovn.
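    // For example, a factor-2 interleaved load with a v4i16 sub-vector
    // (64 bits) qualifies and is costed at 2 * BaseCost: one wider load plus
    // a vrev or vmovn to deinterleave.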
1566480093f4SDimitry Andric     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1567e8d8bef9SDimitry Andric         VecTy->isIntOrIntVectorTy() &&
1568bdd1243dSDimitry Andric         DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1569480093f4SDimitry Andric       return 2 * BaseCost;
15700b57cec5SDimitry Andric   }
15710b57cec5SDimitry Andric 
15720b57cec5SDimitry Andric   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
15735ffd83dbSDimitry Andric                                            Alignment, AddressSpace, CostKind,
15740b57cec5SDimitry Andric                                            UseMaskForCond, UseMaskForGaps);
15750b57cec5SDimitry Andric }
15760b57cec5SDimitry Andric 
1577fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1578fe6060f1SDimitry Andric     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1579fe6060f1SDimitry Andric     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
15805ffd83dbSDimitry Andric   using namespace PatternMatch;
15815ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
15825ffd83dbSDimitry Andric     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
15835ffd83dbSDimitry Andric                                          Alignment, CostKind, I);
15845ffd83dbSDimitry Andric 
15855ffd83dbSDimitry Andric   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
15865ffd83dbSDimitry Andric   auto *VTy = cast<FixedVectorType>(DataTy);
15875ffd83dbSDimitry Andric 
15885ffd83dbSDimitry Andric   // TODO: Splitting, once we do that.
15895ffd83dbSDimitry Andric 
15905ffd83dbSDimitry Andric   unsigned NumElems = VTy->getNumElements();
15915ffd83dbSDimitry Andric   unsigned EltSize = VTy->getScalarSizeInBits();
1592bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
15935ffd83dbSDimitry Andric 
15945ffd83dbSDimitry Andric   // For now, it is assumed that for the MVE gather instructions the loads are
15955ffd83dbSDimitry Andric   // all effectively serialised. This means the cost is the scalar cost
15965ffd83dbSDimitry Andric   // multiplied by the number of elements being loaded. This is possibly very
15975ffd83dbSDimitry Andric   // conservative, but even so we still end up vectorising loops because the
15985ffd83dbSDimitry Andric   // cost per iteration for many loops is lower than for scalar loops.
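  // For example, a legal v4i32 gather is costed below as
  // 4 * 1 * MVEVectorCostFactor, i.e. one serialized load per element.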
1599fe6060f1SDimitry Andric   InstructionCost VectorCost =
1600fe6060f1SDimitry Andric       NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
16015ffd83dbSDimitry Andric   // The scalarization cost should be a lot higher. We use the number of vector
160206c3fb27SDimitry Andric   // elements plus the scalarization overhead. If masking is required, then a lot
160306c3fb27SDimitry Andric   // of little blocks will be needed and potentially a scalarized p0 mask,
160406c3fb27SDimitry Andric   // greatly increasing the cost.
1605fe6060f1SDimitry Andric   InstructionCost ScalarCost =
160606c3fb27SDimitry Andric       NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1607bdd1243dSDimitry Andric       BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1608bdd1243dSDimitry Andric                                       CostKind) +
1609bdd1243dSDimitry Andric       BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1610bdd1243dSDimitry Andric                                       CostKind);
16115ffd83dbSDimitry Andric 
1612e8d8bef9SDimitry Andric   if (EltSize < 8 || Alignment < EltSize / 8)
16135ffd83dbSDimitry Andric     return ScalarCost;
16145ffd83dbSDimitry Andric 
16155ffd83dbSDimitry Andric   unsigned ExtSize = EltSize;
16165ffd83dbSDimitry Andric   // Check whether there's a single user that asks for an extended type
16175ffd83dbSDimitry Andric   if (I != nullptr) {
16185ffd83dbSDimitry Andric     // Depending on the caller of this function, a gather instruction will
16195ffd83dbSDimitry Andric     // either have opcode Instruction::Load or be a call to the masked_gather
16205ffd83dbSDimitry Andric     // intrinsic.
16215ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Load ||
16225ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
16235ffd83dbSDimitry Andric         I->hasOneUse()) {
16245ffd83dbSDimitry Andric       const User *Us = *I->users().begin();
16255ffd83dbSDimitry Andric       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
16265ffd83dbSDimitry Andric         // only allow valid type combinations
16275ffd83dbSDimitry Andric         unsigned TypeSize =
16285ffd83dbSDimitry Andric             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
16295ffd83dbSDimitry Andric         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
16305ffd83dbSDimitry Andric              (TypeSize == 16 && EltSize == 8)) &&
16315ffd83dbSDimitry Andric             TypeSize * NumElems == 128) {
16325ffd83dbSDimitry Andric           ExtSize = TypeSize;
16335ffd83dbSDimitry Andric         }
16345ffd83dbSDimitry Andric       }
16355ffd83dbSDimitry Andric     }
16365ffd83dbSDimitry Andric     // Check whether the input data needs to be truncated
16375ffd83dbSDimitry Andric     TruncInst *T;
16385ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Store ||
16395ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
16405ffd83dbSDimitry Andric         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
16415ffd83dbSDimitry Andric       // Only allow valid type combinations
16425ffd83dbSDimitry Andric       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
16435ffd83dbSDimitry Andric       if (((EltSize == 16 && TypeSize == 32) ||
16445ffd83dbSDimitry Andric            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
16455ffd83dbSDimitry Andric           TypeSize * NumElems == 128)
16465ffd83dbSDimitry Andric         ExtSize = TypeSize;
16475ffd83dbSDimitry Andric     }
16485ffd83dbSDimitry Andric   }
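  // Illustrative IR sketch of an extending gather handled by the check above
  // (hypothetical values, not taken from a test); it sets ExtSize to 32:
  //   %g = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %p, i32 1,
  //                                                    <4 x i1> %m, <4 x i8> undef)
  //   %z = zext <4 x i8> %g to <4 x i32>  ; TypeSize(32) * NumElems(4) == 128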
16495ffd83dbSDimitry Andric 
16505ffd83dbSDimitry Andric   if (ExtSize * NumElems != 128 || NumElems < 4)
16515ffd83dbSDimitry Andric     return ScalarCost;
16525ffd83dbSDimitry Andric 
16535ffd83dbSDimitry Andric   // Any (aligned) i32 gather will not need to be scalarised.
16545ffd83dbSDimitry Andric   if (ExtSize == 32)
16555ffd83dbSDimitry Andric     return VectorCost;
16565ffd83dbSDimitry Andric   // For smaller types, we need to ensure that the gep's inputs are correctly
16575ffd83dbSDimitry Andric   // extended from a small enough value. Other sizes (including i64) are
16585ffd83dbSDimitry Andric   // scalarized for now.
16595ffd83dbSDimitry Andric   if (ExtSize != 8 && ExtSize != 16)
16605ffd83dbSDimitry Andric     return ScalarCost;
16615ffd83dbSDimitry Andric 
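  // The pattern accepted below is, illustratively, a gather of i16s through
  //   %gep = getelementptr i16, ptr %base, <8 x i32> %offs
  // where %offs is zext'd from <8 x i16> (or narrower) indices, so the
  // offsets are known to fit the offset range of the MVE gathers.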
16625ffd83dbSDimitry Andric   if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
16635ffd83dbSDimitry Andric     Ptr = BC->getOperand(0);
16645ffd83dbSDimitry Andric   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
16655ffd83dbSDimitry Andric     if (GEP->getNumOperands() != 2)
16665ffd83dbSDimitry Andric       return ScalarCost;
16675ffd83dbSDimitry Andric     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
16685ffd83dbSDimitry Andric     // Scale needs to be correct (which is only relevant for i16s).
16695ffd83dbSDimitry Andric     if (Scale != 1 && Scale * 8 != ExtSize)
16705ffd83dbSDimitry Andric       return ScalarCost;
16715ffd83dbSDimitry Andric     // And we need to zext (not sext) the indexes from a small enough type.
16725ffd83dbSDimitry Andric     if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
16735ffd83dbSDimitry Andric       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
16745ffd83dbSDimitry Andric         return VectorCost;
16755ffd83dbSDimitry Andric     }
16765ffd83dbSDimitry Andric     return ScalarCost;
16775ffd83dbSDimitry Andric   }
16785ffd83dbSDimitry Andric   return ScalarCost;
16795ffd83dbSDimitry Andric }
16805ffd83dbSDimitry Andric 
1681fe6060f1SDimitry Andric InstructionCost
1682fe6060f1SDimitry Andric ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1683bdd1243dSDimitry Andric                                        std::optional<FastMathFlags> FMF,
1684e8d8bef9SDimitry Andric                                        TTI::TargetCostKind CostKind) {
1685fe6060f1SDimitry Andric 
1686e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1687e8d8bef9SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
16885f757f3fSDimitry Andric   unsigned EltSize = ValVT.getScalarSizeInBits();
16895f757f3fSDimitry Andric 
16905f757f3fSDimitry Andric   // In general floating point reductions are a series of elementwise
16915f757f3fSDimitry Andric   // operations, with free extracts on each step. These are either in-order or
16925f757f3fSDimitry Andric   // treewise depending on whether that is allowed by the fast math flags.
16935f757f3fSDimitry Andric   if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
16945f757f3fSDimitry Andric       ((EltSize == 32 && ST->hasVFP2Base()) ||
16955f757f3fSDimitry Andric        (EltSize == 64 && ST->hasFP64()) ||
16965f757f3fSDimitry Andric        (EltSize == 16 && ST->hasFullFP16()))) {
16975f757f3fSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
16985f757f3fSDimitry Andric     unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
16995f757f3fSDimitry Andric     InstructionCost VecCost = 0;
17005f757f3fSDimitry Andric     while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
17015f757f3fSDimitry Andric            NumElts * EltSize > VecLimit) {
17025f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
17035f757f3fSDimitry Andric       VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
17045f757f3fSDimitry Andric       NumElts /= 2;
17055f757f3fSDimitry Andric     }
17065f757f3fSDimitry Andric 
17075f757f3fSDimitry Andric     // For fp16 we need to extract the upper lane elements. MVE can add a
17085f757f3fSDimitry Andric     // VREV+FMIN/MAX to perform another vector step instead.
17095f757f3fSDimitry Andric     InstructionCost ExtractCost = 0;
17105f757f3fSDimitry Andric     if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
17115f757f3fSDimitry Andric         ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
17125f757f3fSDimitry Andric       VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
17135f757f3fSDimitry Andric       NumElts /= 2;
17145f757f3fSDimitry Andric     } else if (ValVT.getVectorElementType() == MVT::f16)
17155f757f3fSDimitry Andric       ExtractCost = NumElts / 2;
17165f757f3fSDimitry Andric 
17175f757f3fSDimitry Andric     return VecCost + ExtractCost +
17185f757f3fSDimitry Andric            NumElts *
17195f757f3fSDimitry Andric                getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
17205f757f3fSDimitry Andric   }
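  // Worked example (illustrative): a fast-math fadd reduction of v8f32 with
  // MVE float ops has VecLimit = 128, so the loop above emits one v4f32 fadd
  // (8 * 32 > 128), leaving NumElts = 4; the total is that vector add plus
  // four scalar fadds, with no extract cost for f32.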
17215f757f3fSDimitry Andric 
17225f757f3fSDimitry Andric   if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
17235f757f3fSDimitry Andric       (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
17245f757f3fSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
17255f757f3fSDimitry Andric     unsigned VecLimit =
17265f757f3fSDimitry Andric         ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
17275f757f3fSDimitry Andric     InstructionCost VecCost = 0;
17285f757f3fSDimitry Andric     while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
17295f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
17305f757f3fSDimitry Andric       VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
17315f757f3fSDimitry Andric       NumElts /= 2;
17325f757f3fSDimitry Andric     }
17335f757f3fSDimitry Andric     // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
17345f757f3fSDimitry Andric     // step.
17355f757f3fSDimitry Andric     if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
17365f757f3fSDimitry Andric         NumElts * EltSize == 64) {
17375f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
17385f757f3fSDimitry Andric       VecCost += ST->getMVEVectorCostFactor(CostKind) +
17395f757f3fSDimitry Andric                  getArithmeticInstrCost(Opcode, VecTy, CostKind);
17405f757f3fSDimitry Andric       NumElts /= 2;
17415f757f3fSDimitry Andric     }
17425f757f3fSDimitry Andric 
17435f757f3fSDimitry Andric     // From here we extract the elements and perform the and/or/xor.
17445f757f3fSDimitry Andric     InstructionCost ExtractCost = NumElts;
17455f757f3fSDimitry Andric     return VecCost + ExtractCost +
17465f757f3fSDimitry Andric            (NumElts - 1) * getArithmeticInstrCost(
17475f757f3fSDimitry Andric                                Opcode, ValTy->getElementType(), CostKind);
17485f757f3fSDimitry Andric   }
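  // Worked example (illustrative): an xor reduction of v8i8 under MVE skips
  // the halving loop (8 * 8 = 64 <= 128) but takes the 64bit VREV+VEOR step
  // above, leaving NumElts = 4; the total is that vector step plus four
  // extracts and three scalar xors.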
17495f757f3fSDimitry Andric 
17505f757f3fSDimitry Andric   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
17515f757f3fSDimitry Andric       TTI::requiresOrderedReduction(FMF))
1752fe6060f1SDimitry Andric     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1753e8d8bef9SDimitry Andric 
1754bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1755e8d8bef9SDimitry Andric 
1756e8d8bef9SDimitry Andric   static const CostTblEntry CostTblAdd[]{
1757e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v16i8, 1},
1758e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v8i16, 1},
1759e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v4i32, 1},
1760e8d8bef9SDimitry Andric   };
1761e8d8bef9SDimitry Andric   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1762fe6060f1SDimitry Andric     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1763e8d8bef9SDimitry Andric 
1764fe6060f1SDimitry Andric   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1765e8d8bef9SDimitry Andric }
1766e8d8bef9SDimitry Andric 
1767bdd1243dSDimitry Andric InstructionCost ARMTTIImpl::getExtendedReductionCost(
1768bdd1243dSDimitry Andric     unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
176906c3fb27SDimitry Andric     FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1770bdd1243dSDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1771bdd1243dSDimitry Andric   EVT ResVT = TLI->getValueType(DL, ResTy);
1772bdd1243dSDimitry Andric 
1773bdd1243dSDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1774bdd1243dSDimitry Andric 
1775bdd1243dSDimitry Andric   switch (ISD) {
1776bdd1243dSDimitry Andric   case ISD::ADD:
1777bdd1243dSDimitry Andric     if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1778bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1779bdd1243dSDimitry Andric 
1780bdd1243dSDimitry Andric       // The legal cases are:
1781bdd1243dSDimitry Andric       //   VADDV u/s 8/16/32
1782bdd1243dSDimitry Andric       //   VADDLV u/s 32
1783bdd1243dSDimitry Andric       // Codegen currently cannot always handle larger than legal vectors very
1784bdd1243dSDimitry Andric       // well, especially for predicated reductions where the mask needs to be
1785bdd1243dSDimitry Andric       // split, so restrict to 128bit or smaller input types.
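      // For example (illustrative), an i64 vecreduce.add of a v4i32 input
      // zext'd to v4i64 maps onto VADDLV.u32, matching the v4i32 /
      // RevVTSize <= 64 case below.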
1786bdd1243dSDimitry Andric       unsigned RevVTSize = ResVT.getSizeInBits();
1787bdd1243dSDimitry Andric       if (ValVT.getSizeInBits() <= 128 &&
1788bdd1243dSDimitry Andric           ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1789bdd1243dSDimitry Andric            (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1790bdd1243dSDimitry Andric            (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1791bdd1243dSDimitry Andric         return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1792bdd1243dSDimitry Andric     }
1793bdd1243dSDimitry Andric     break;
1794bdd1243dSDimitry Andric   default:
1795bdd1243dSDimitry Andric     break;
1796bdd1243dSDimitry Andric   }
1797bdd1243dSDimitry Andric   return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1798bdd1243dSDimitry Andric                                          CostKind);
1799bdd1243dSDimitry Andric }
1800bdd1243dSDimitry Andric 
1801e8d8bef9SDimitry Andric InstructionCost
1802bdd1243dSDimitry Andric ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1803bdd1243dSDimitry Andric                                    VectorType *ValTy,
1804e8d8bef9SDimitry Andric                                    TTI::TargetCostKind CostKind) {
1805e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1806e8d8bef9SDimitry Andric   EVT ResVT = TLI->getValueType(DL, ResTy);
1807349cc55cSDimitry Andric 
1808e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1809bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1810349cc55cSDimitry Andric 
1811349cc55cSDimitry Andric     // The legal cases are:
1812349cc55cSDimitry Andric     //   VMLAV u/s 8/16/32
1813349cc55cSDimitry Andric     //   VMLALV u/s 16/32
1814349cc55cSDimitry Andric     // Codegen currently cannot always handle larger than legal vectors very
1815349cc55cSDimitry Andric     // well, especially for predicated reductions where the mask needs to be
1816349cc55cSDimitry Andric     // split, so restrict to 128bit or smaller input types.
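    // For example (illustrative), an i32 dot product formed from two v16i8
    // inputs (mul of zexts feeding a vecreduce.add) maps onto VMLAV.u8,
    // matching the v16i8 / RevVTSize <= 32 case below.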
1817349cc55cSDimitry Andric     unsigned RevVTSize = ResVT.getSizeInBits();
1818349cc55cSDimitry Andric     if (ValVT.getSizeInBits() <= 128 &&
1819349cc55cSDimitry Andric         ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1820bdd1243dSDimitry Andric          (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1821349cc55cSDimitry Andric          (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1822fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1823e8d8bef9SDimitry Andric   }
1824e8d8bef9SDimitry Andric 
1825bdd1243dSDimitry Andric   return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1826e8d8bef9SDimitry Andric }
1827e8d8bef9SDimitry Andric 
1828fe6060f1SDimitry Andric InstructionCost
18295f757f3fSDimitry Andric ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
18305f757f3fSDimitry Andric                                    FastMathFlags FMF,
18315f757f3fSDimitry Andric                                    TTI::TargetCostKind CostKind) {
18325f757f3fSDimitry Andric   EVT ValVT = TLI->getValueType(DL, Ty);
18335f757f3fSDimitry Andric 
18345f757f3fSDimitry Andric   // In general floating point reductions are a series of elementwise
18355f757f3fSDimitry Andric   // operations, with free extracts on each step. These are either in-order or
18365f757f3fSDimitry Andric   // treewise depending on whether that is allowed by the fast math flags.
18375f757f3fSDimitry Andric   if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
18385f757f3fSDimitry Andric       ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
18395f757f3fSDimitry Andric        (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
18405f757f3fSDimitry Andric        (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
18415f757f3fSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
18425f757f3fSDimitry Andric     unsigned EltSize = ValVT.getScalarSizeInBits();
18435f757f3fSDimitry Andric     unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
18445f757f3fSDimitry Andric     InstructionCost VecCost = 0;
18455f757f3fSDimitry Andric     while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
18465f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
18475f757f3fSDimitry Andric       IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
18485f757f3fSDimitry Andric       VecCost += getIntrinsicInstrCost(ICA, CostKind);
18495f757f3fSDimitry Andric       NumElts /= 2;
18505f757f3fSDimitry Andric     }
18515f757f3fSDimitry Andric 
18525f757f3fSDimitry Andric     // For fp16 we need to extract the upper lane elements. MVE can add a
18535f757f3fSDimitry Andric     // VREV+FMIN/MAX to perform another vector step instead.
18545f757f3fSDimitry Andric     InstructionCost ExtractCost = 0;
18555f757f3fSDimitry Andric     if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
18565f757f3fSDimitry Andric         NumElts == 8) {
18575f757f3fSDimitry Andric       VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
18585f757f3fSDimitry Andric       NumElts /= 2;
18595f757f3fSDimitry Andric     } else if (ValVT.getVectorElementType() == MVT::f16)
18605f757f3fSDimitry Andric       ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
18615f757f3fSDimitry Andric 
18625f757f3fSDimitry Andric     IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
18635f757f3fSDimitry Andric                                 {Ty->getElementType(), Ty->getElementType()},
18645f757f3fSDimitry Andric                                 FMF);
18655f757f3fSDimitry Andric     return VecCost + ExtractCost +
18665f757f3fSDimitry Andric            (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
18675f757f3fSDimitry Andric   }
18685f757f3fSDimitry Andric 
18695f757f3fSDimitry Andric   if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
18705f757f3fSDimitry Andric       IID == Intrinsic::umin || IID == Intrinsic::umax) {
18715f757f3fSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
18725f757f3fSDimitry Andric 
18735f757f3fSDimitry Andric     // All costs are the same for u/s min/max. These lower to vminv/vmaxv,
18745f757f3fSDimitry Andric     // which are given a slightly higher cost as they tend to take multiple
18755f757f3fSDimitry Andric     // cycles for smaller type sizes.
18765f757f3fSDimitry Andric     static const CostTblEntry CostTblAdd[]{
18775f757f3fSDimitry Andric         {ISD::SMIN, MVT::v16i8, 4},
18785f757f3fSDimitry Andric         {ISD::SMIN, MVT::v8i16, 3},
18795f757f3fSDimitry Andric         {ISD::SMIN, MVT::v4i32, 2},
18805f757f3fSDimitry Andric     };
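    // e.g. (illustrative) a umin reduction of v8i16 hits the v8i16 row above
    // and costs 3 * getMVEVectorCostFactor(CostKind) * LT.first.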
18815f757f3fSDimitry Andric     if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
18825f757f3fSDimitry Andric       return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
18835f757f3fSDimitry Andric   }
18845f757f3fSDimitry Andric 
18855f757f3fSDimitry Andric   return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
18865f757f3fSDimitry Andric }
18875f757f3fSDimitry Andric 
18885f757f3fSDimitry Andric InstructionCost
1889fe6060f1SDimitry Andric ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1890e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1891e8d8bef9SDimitry Andric   switch (ICA.getID()) {
1892e8d8bef9SDimitry Andric   case Intrinsic::get_active_lane_mask:
1893e8d8bef9SDimitry Andric     // Currently we make a somewhat optimistic assumption that
1894e8d8bef9SDimitry Andric     // active_lane_masks are always free. In reality one may be freely folded
1895e8d8bef9SDimitry Andric     // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1896e8d8bef9SDimitry Andric     // of add/icmp code. We may need to improve this in the future, but being
1897e8d8bef9SDimitry Andric     // able to detect if it is free or not involves looking at a lot of other
1898e8d8bef9SDimitry Andric     // code. We currently assume that the vectorizer inserted these, and knew
1899e8d8bef9SDimitry Andric     // what it was doing in adding one.
1900e8d8bef9SDimitry Andric     if (ST->hasMVEIntegerOps())
1901e8d8bef9SDimitry Andric       return 0;
1902e8d8bef9SDimitry Andric     break;
1903e8d8bef9SDimitry Andric   case Intrinsic::sadd_sat:
1904e8d8bef9SDimitry Andric   case Intrinsic::ssub_sat:
1905e8d8bef9SDimitry Andric   case Intrinsic::uadd_sat:
1906e8d8bef9SDimitry Andric   case Intrinsic::usub_sat: {
1907e8d8bef9SDimitry Andric     if (!ST->hasMVEIntegerOps())
1908e8d8bef9SDimitry Andric       break;
1909e8d8bef9SDimitry Andric     Type *VT = ICA.getReturnType();
1910e8d8bef9SDimitry Andric 
1911bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1912e8d8bef9SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1913e8d8bef9SDimitry Andric         LT.second == MVT::v16i8) {
1914fe6060f1SDimitry Andric       // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1915e8d8bef9SDimitry Andric       // need to extend the type, as it uses shr(qadd(shl, shl)).
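      // For example (illustrative), if a sub-legal type such as v4i16 is
      // element-promoted to v4i32, the shl/shl/qadd/shr sequence is needed
      // and Instrs is 4 instead of 1.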
1916fe6060f1SDimitry Andric       unsigned Instrs =
1917fe6060f1SDimitry Andric           LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1918fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1919e8d8bef9SDimitry Andric     }
1920e8d8bef9SDimitry Andric     break;
1921e8d8bef9SDimitry Andric   }
1922fe6060f1SDimitry Andric   case Intrinsic::abs:
1923fe6060f1SDimitry Andric   case Intrinsic::smin:
1924fe6060f1SDimitry Andric   case Intrinsic::smax:
1925fe6060f1SDimitry Andric   case Intrinsic::umin:
1926fe6060f1SDimitry Andric   case Intrinsic::umax: {
1927fe6060f1SDimitry Andric     if (!ST->hasMVEIntegerOps())
1928fe6060f1SDimitry Andric       break;
1929fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1930fe6060f1SDimitry Andric 
1931bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1932fe6060f1SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1933fe6060f1SDimitry Andric         LT.second == MVT::v16i8)
1934fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1935fe6060f1SDimitry Andric     break;
1936fe6060f1SDimitry Andric   }
1937fe6060f1SDimitry Andric   case Intrinsic::minnum:
1938fe6060f1SDimitry Andric   case Intrinsic::maxnum: {
1939fe6060f1SDimitry Andric     if (!ST->hasMVEFloatOps())
1940fe6060f1SDimitry Andric       break;
1941fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1942bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1943fe6060f1SDimitry Andric     if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1944fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1945fe6060f1SDimitry Andric     break;
1946fe6060f1SDimitry Andric   }
194781ad6265SDimitry Andric   case Intrinsic::fptosi_sat:
194881ad6265SDimitry Andric   case Intrinsic::fptoui_sat: {
194981ad6265SDimitry Andric     if (ICA.getArgTypes().empty())
195081ad6265SDimitry Andric       break;
195181ad6265SDimitry Andric     bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1952bdd1243dSDimitry Andric     auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
195381ad6265SDimitry Andric     EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
195481ad6265SDimitry Andric     // Check for the legal types, with the correct subtarget features.
195581ad6265SDimitry Andric     if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
195681ad6265SDimitry Andric         (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
195781ad6265SDimitry Andric         (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
195881ad6265SDimitry Andric       return LT.first;
195981ad6265SDimitry Andric 
196081ad6265SDimitry Andric     // Equally for MVE vector types
196181ad6265SDimitry Andric     if (ST->hasMVEFloatOps() &&
196281ad6265SDimitry Andric         (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
196381ad6265SDimitry Andric         LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
196481ad6265SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
196581ad6265SDimitry Andric 
196681ad6265SDimitry Andric     // Otherwise we use a legal convert followed by a min+max
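    // (Illustratively, fptosi.sat from v4f32 to v4i8 would be costed as one
    // vector convert plus an i32 smin against 127 and an i32 smax against
    // -128.)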
196781ad6265SDimitry Andric     if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
196881ad6265SDimitry Andric          (ST->hasFP64() && LT.second == MVT::f64) ||
196981ad6265SDimitry Andric          (ST->hasFullFP16() && LT.second == MVT::f16) ||
197081ad6265SDimitry Andric          (ST->hasMVEFloatOps() &&
197181ad6265SDimitry Andric           (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
197281ad6265SDimitry Andric         LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
197381ad6265SDimitry Andric       Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
197481ad6265SDimitry Andric                                       LT.second.getScalarSizeInBits());
197581ad6265SDimitry Andric       InstructionCost Cost =
197681ad6265SDimitry Andric           LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
197781ad6265SDimitry Andric       IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
197881ad6265SDimitry Andric                                               : Intrinsic::umin,
197981ad6265SDimitry Andric                                      LegalTy, {LegalTy, LegalTy});
198081ad6265SDimitry Andric       Cost += getIntrinsicInstrCost(Attrs1, CostKind);
198181ad6265SDimitry Andric       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
198281ad6265SDimitry Andric                                               : Intrinsic::umax,
198381ad6265SDimitry Andric                                      LegalTy, {LegalTy, LegalTy});
198481ad6265SDimitry Andric       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
198581ad6265SDimitry Andric       return LT.first * Cost;
198681ad6265SDimitry Andric     }
198781ad6265SDimitry Andric     break;
198881ad6265SDimitry Andric   }
1989e8d8bef9SDimitry Andric   }
1990e8d8bef9SDimitry Andric 
1991e8d8bef9SDimitry Andric   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1992e8d8bef9SDimitry Andric }
1993e8d8bef9SDimitry Andric 
19940b57cec5SDimitry Andric bool ARMTTIImpl::isLoweredToCall(const Function *F) {
19950b57cec5SDimitry Andric   if (!F->isIntrinsic())
199681ad6265SDimitry Andric     return BaseT::isLoweredToCall(F);
19970b57cec5SDimitry Andric 
19980b57cec5SDimitry Andric   // Assume all Arm-specific intrinsics map to an instruction.
19995f757f3fSDimitry Andric   if (F->getName().starts_with("llvm.arm"))
20000b57cec5SDimitry Andric     return false;
20010b57cec5SDimitry Andric 
20020b57cec5SDimitry Andric   switch (F->getIntrinsicID()) {
20030b57cec5SDimitry Andric   default: break;
20040b57cec5SDimitry Andric   case Intrinsic::powi:
20050b57cec5SDimitry Andric   case Intrinsic::sin:
20060b57cec5SDimitry Andric   case Intrinsic::cos:
20070b57cec5SDimitry Andric   case Intrinsic::pow:
20080b57cec5SDimitry Andric   case Intrinsic::log:
20090b57cec5SDimitry Andric   case Intrinsic::log10:
20100b57cec5SDimitry Andric   case Intrinsic::log2:
20110b57cec5SDimitry Andric   case Intrinsic::exp:
20120b57cec5SDimitry Andric   case Intrinsic::exp2:
20130b57cec5SDimitry Andric     return true;
20140b57cec5SDimitry Andric   case Intrinsic::sqrt:
20150b57cec5SDimitry Andric   case Intrinsic::fabs:
20160b57cec5SDimitry Andric   case Intrinsic::copysign:
20170b57cec5SDimitry Andric   case Intrinsic::floor:
20180b57cec5SDimitry Andric   case Intrinsic::ceil:
20190b57cec5SDimitry Andric   case Intrinsic::trunc:
20200b57cec5SDimitry Andric   case Intrinsic::rint:
20210b57cec5SDimitry Andric   case Intrinsic::nearbyint:
20220b57cec5SDimitry Andric   case Intrinsic::round:
20230b57cec5SDimitry Andric   case Intrinsic::canonicalize:
20240b57cec5SDimitry Andric   case Intrinsic::lround:
20250b57cec5SDimitry Andric   case Intrinsic::llround:
20260b57cec5SDimitry Andric   case Intrinsic::lrint:
20270b57cec5SDimitry Andric   case Intrinsic::llrint:
20280b57cec5SDimitry Andric     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
20290b57cec5SDimitry Andric       return true;
20300b57cec5SDimitry Andric     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
20310b57cec5SDimitry Andric       return true;
20320b57cec5SDimitry Andric     // Some operations can be handled by vector instructions and assume
20330b57cec5SDimitry Andric     // unsupported vectors will be expanded into supported scalar ones.
20340b57cec5SDimitry Andric     // TODO Handle scalar operations properly.
20350b57cec5SDimitry Andric     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
20360b57cec5SDimitry Andric   case Intrinsic::masked_store:
20370b57cec5SDimitry Andric   case Intrinsic::masked_load:
20380b57cec5SDimitry Andric   case Intrinsic::masked_gather:
20390b57cec5SDimitry Andric   case Intrinsic::masked_scatter:
20400b57cec5SDimitry Andric     return !ST->hasMVEIntegerOps();
20410b57cec5SDimitry Andric   case Intrinsic::sadd_with_overflow:
20420b57cec5SDimitry Andric   case Intrinsic::uadd_with_overflow:
20430b57cec5SDimitry Andric   case Intrinsic::ssub_with_overflow:
20440b57cec5SDimitry Andric   case Intrinsic::usub_with_overflow:
20450b57cec5SDimitry Andric   case Intrinsic::sadd_sat:
20460b57cec5SDimitry Andric   case Intrinsic::uadd_sat:
20470b57cec5SDimitry Andric   case Intrinsic::ssub_sat:
20480b57cec5SDimitry Andric   case Intrinsic::usub_sat:
20490b57cec5SDimitry Andric     return false;
20500b57cec5SDimitry Andric   }
20510b57cec5SDimitry Andric 
20520b57cec5SDimitry Andric   return BaseT::isLoweredToCall(F);
20530b57cec5SDimitry Andric }
20540b57cec5SDimitry Andric 
2055e8d8bef9SDimitry Andric bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
20560b57cec5SDimitry Andric   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
20570b57cec5SDimitry Andric   EVT VT = TLI->getValueType(DL, I.getType(), true);
20580b57cec5SDimitry Andric   if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
20590b57cec5SDimitry Andric     return true;
20600b57cec5SDimitry Andric 
20610b57cec5SDimitry Andric   // Check if an intrinsic will be lowered to a call and assume that any
20620b57cec5SDimitry Andric   // other CallInst will generate a bl.
20630b57cec5SDimitry Andric   if (auto *Call = dyn_cast<CallInst>(&I)) {
2064e8d8bef9SDimitry Andric     if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2065e8d8bef9SDimitry Andric       switch(II->getIntrinsicID()) {
2066e8d8bef9SDimitry Andric         case Intrinsic::memcpy:
2067e8d8bef9SDimitry Andric         case Intrinsic::memset:
2068e8d8bef9SDimitry Andric         case Intrinsic::memmove:
2069e8d8bef9SDimitry Andric           return getNumMemOps(II) == -1;
2070e8d8bef9SDimitry Andric         default:
20710b57cec5SDimitry Andric           if (const Function *F = Call->getCalledFunction())
20720b57cec5SDimitry Andric             return isLoweredToCall(F);
20730b57cec5SDimitry Andric       }
2074e8d8bef9SDimitry Andric     }
20750b57cec5SDimitry Andric     return true;
20760b57cec5SDimitry Andric   }
20770b57cec5SDimitry Andric 
20780b57cec5SDimitry Andric   // FPv5 provides conversions between integer, double-precision,
20790b57cec5SDimitry Andric   // single-precision, and half-precision formats.
20800b57cec5SDimitry Andric   switch (I.getOpcode()) {
20810b57cec5SDimitry Andric   default:
20820b57cec5SDimitry Andric     break;
20830b57cec5SDimitry Andric   case Instruction::FPToSI:
20840b57cec5SDimitry Andric   case Instruction::FPToUI:
20850b57cec5SDimitry Andric   case Instruction::SIToFP:
20860b57cec5SDimitry Andric   case Instruction::UIToFP:
20870b57cec5SDimitry Andric   case Instruction::FPTrunc:
20880b57cec5SDimitry Andric   case Instruction::FPExt:
20890b57cec5SDimitry Andric     return !ST->hasFPARMv8Base();
20900b57cec5SDimitry Andric   }
20910b57cec5SDimitry Andric 
20920b57cec5SDimitry Andric   // FIXME: Unfortunately the approach of checking the Operation Action does
20930b57cec5SDimitry Andric   // not catch all cases of Legalization that use library calls. Our
20940b57cec5SDimitry Andric   // Legalization step categorizes some transformations into library calls as
20950b57cec5SDimitry Andric   // Custom, Expand or even Legal when doing type legalization. So for now
20960b57cec5SDimitry Andric   // we have to special case for instance the SDIV of 64bit integers and the
20970b57cec5SDimitry Andric   // use of floating point emulation.
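  // For example, a 64-bit sdiv here is typically expanded to a call to the
  // AEABI runtime helper __aeabi_ldivmod.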
20980b57cec5SDimitry Andric   if (VT.isInteger() && VT.getSizeInBits() >= 64) {
20990b57cec5SDimitry Andric     switch (ISD) {
21000b57cec5SDimitry Andric     default:
21010b57cec5SDimitry Andric       break;
21020b57cec5SDimitry Andric     case ISD::SDIV:
21030b57cec5SDimitry Andric     case ISD::UDIV:
21040b57cec5SDimitry Andric     case ISD::SREM:
21050b57cec5SDimitry Andric     case ISD::UREM:
21060b57cec5SDimitry Andric     case ISD::SDIVREM:
21070b57cec5SDimitry Andric     case ISD::UDIVREM:
21080b57cec5SDimitry Andric       return true;
21090b57cec5SDimitry Andric     }
21100b57cec5SDimitry Andric   }
21110b57cec5SDimitry Andric 
21120b57cec5SDimitry Andric   // Assume all other non-float operations are supported.
21130b57cec5SDimitry Andric   if (!VT.isFloatingPoint())
21140b57cec5SDimitry Andric     return false;
21150b57cec5SDimitry Andric 
21160b57cec5SDimitry Andric   // We'll need a library call to handle most floats when using soft-float.
21170b57cec5SDimitry Andric   if (TLI->useSoftFloat()) {
21180b57cec5SDimitry Andric     switch (I.getOpcode()) {
21190b57cec5SDimitry Andric     default:
21200b57cec5SDimitry Andric       return true;
21210b57cec5SDimitry Andric     case Instruction::Alloca:
21220b57cec5SDimitry Andric     case Instruction::Load:
21230b57cec5SDimitry Andric     case Instruction::Store:
21240b57cec5SDimitry Andric     case Instruction::Select:
21250b57cec5SDimitry Andric     case Instruction::PHI:
21260b57cec5SDimitry Andric       return false;
21270b57cec5SDimitry Andric     }
21280b57cec5SDimitry Andric   }
21290b57cec5SDimitry Andric 
21300b57cec5SDimitry Andric   // We'll need a libcall to perform double precision operations on a single
21310b57cec5SDimitry Andric   // precision only FPU.
21320b57cec5SDimitry Andric   if (I.getType()->isDoubleTy() && !ST->hasFP64())
21330b57cec5SDimitry Andric     return true;
21340b57cec5SDimitry Andric 
21350b57cec5SDimitry Andric   // Likewise for half precision arithmetic.
21360b57cec5SDimitry Andric   if (I.getType()->isHalfTy() && !ST->hasFullFP16())
21370b57cec5SDimitry Andric     return true;
21380b57cec5SDimitry Andric 
21390b57cec5SDimitry Andric   return false;
2140e8d8bef9SDimitry Andric }
2141e8d8bef9SDimitry Andric 
2142e8d8bef9SDimitry Andric bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2143e8d8bef9SDimitry Andric                                           AssumptionCache &AC,
2144e8d8bef9SDimitry Andric                                           TargetLibraryInfo *LibInfo,
2145e8d8bef9SDimitry Andric                                           HardwareLoopInfo &HWLoopInfo) {
2146e8d8bef9SDimitry Andric   // Low-overhead branches are only supported in the 'low-overhead branch'
2147e8d8bef9SDimitry Andric   // extension of v8.1-m.
2148e8d8bef9SDimitry Andric   if (!ST->hasLOB() || DisableLowOverheadLoops) {
2149e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2150e8d8bef9SDimitry Andric     return false;
2151e8d8bef9SDimitry Andric   }
2152e8d8bef9SDimitry Andric 
2153e8d8bef9SDimitry Andric   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2154e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2155e8d8bef9SDimitry Andric     return false;
2156e8d8bef9SDimitry Andric   }
2157e8d8bef9SDimitry Andric 
2158e8d8bef9SDimitry Andric   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2159e8d8bef9SDimitry Andric   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2160e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2161e8d8bef9SDimitry Andric     return false;
2162e8d8bef9SDimitry Andric   }
2163e8d8bef9SDimitry Andric 
2164e8d8bef9SDimitry Andric   const SCEV *TripCountSCEV =
2165e8d8bef9SDimitry Andric     SE.getAddExpr(BackedgeTakenCount,
2166e8d8bef9SDimitry Andric                   SE.getOne(BackedgeTakenCount->getType()));
2167e8d8bef9SDimitry Andric 
2168e8d8bef9SDimitry Andric   // We need to store the trip count in LR, a 32-bit register.
2169e8d8bef9SDimitry Andric   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2170e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2171e8d8bef9SDimitry Andric     return false;
2172e8d8bef9SDimitry Andric   }
2173e8d8bef9SDimitry Andric 
2174e8d8bef9SDimitry Andric   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2175e8d8bef9SDimitry Andric   // point in generating a hardware loop if that's going to happen.
21760b57cec5SDimitry Andric 
21770b57cec5SDimitry Andric   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
21780b57cec5SDimitry Andric     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
21790b57cec5SDimitry Andric       switch (Call->getIntrinsicID()) {
21800b57cec5SDimitry Andric       default:
21810b57cec5SDimitry Andric         break;
2182e8d8bef9SDimitry Andric       case Intrinsic::start_loop_iterations:
2183fe6060f1SDimitry Andric       case Intrinsic::test_start_loop_iterations:
21840b57cec5SDimitry Andric       case Intrinsic::loop_decrement:
21850b57cec5SDimitry Andric       case Intrinsic::loop_decrement_reg:
21860b57cec5SDimitry Andric         return true;
21870b57cec5SDimitry Andric       }
21880b57cec5SDimitry Andric     }
21890b57cec5SDimitry Andric     return false;
21900b57cec5SDimitry Andric   };
21910b57cec5SDimitry Andric 
21920b57cec5SDimitry Andric   // Scan the instructions to see if there are any that we know will turn into a
2193e8d8bef9SDimitry Andric   // call or if this loop is already a low-overhead loop or will become a tail
2194e8d8bef9SDimitry Andric   // predicated loop.
2195e8d8bef9SDimitry Andric   bool IsTailPredLoop = false;
21960b57cec5SDimitry Andric   auto ScanLoop = [&](Loop *L) {
21970b57cec5SDimitry Andric     for (auto *BB : L->getBlocks()) {
21980b57cec5SDimitry Andric       for (auto &I : *BB) {
2199e8d8bef9SDimitry Andric         if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2200e8d8bef9SDimitry Andric             isa<InlineAsm>(I)) {
22015ffd83dbSDimitry Andric           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
22020b57cec5SDimitry Andric           return false;
22030b57cec5SDimitry Andric         }
2204e8d8bef9SDimitry Andric         if (auto *II = dyn_cast<IntrinsicInst>(&I))
2205e8d8bef9SDimitry Andric           IsTailPredLoop |=
2206e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2207e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2208e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2209e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2210e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
22110b57cec5SDimitry Andric       }
22125ffd83dbSDimitry Andric     }
22130b57cec5SDimitry Andric     return true;
22140b57cec5SDimitry Andric   };
22150b57cec5SDimitry Andric 
22160b57cec5SDimitry Andric   // Visit inner loops.
2217bdd1243dSDimitry Andric   for (auto *Inner : *L)
22180b57cec5SDimitry Andric     if (!ScanLoop(Inner))
22190b57cec5SDimitry Andric       return false;
22200b57cec5SDimitry Andric 
22210b57cec5SDimitry Andric   if (!ScanLoop(L))
22220b57cec5SDimitry Andric     return false;
22230b57cec5SDimitry Andric 
22240b57cec5SDimitry Andric   // TODO: Check whether the trip count calculation is expensive. If L is the
22250b57cec5SDimitry Andric   // inner loop but we know it has a low trip count, calculating that trip
22260b57cec5SDimitry Andric   // count (in the parent loop) may be detrimental.
22270b57cec5SDimitry Andric 
22280b57cec5SDimitry Andric   LLVMContext &C = L->getHeader()->getContext();
22290b57cec5SDimitry Andric   HWLoopInfo.CounterInReg = true;
22300b57cec5SDimitry Andric   HWLoopInfo.IsNestingLegal = false;
2231e8d8bef9SDimitry Andric   HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
22320b57cec5SDimitry Andric   HWLoopInfo.CountType = Type::getInt32Ty(C);
22330b57cec5SDimitry Andric   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
22340b57cec5SDimitry Andric   return true;
22350b57cec5SDimitry Andric }
22360b57cec5SDimitry Andric 
2237480093f4SDimitry Andric static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2238480093f4SDimitry Andric   // We don't allow icmps, and because we only look at single block loops,
2239480093f4SDimitry Andric   // we simply count the icmps, i.e. there should only be 1 for the backedge.
2240480093f4SDimitry Andric   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2241480093f4SDimitry Andric     return false;
2242349cc55cSDimitry Andric   // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2243349cc55cSDimitry Andric   // not currently canonical, but soon will be. Code without them uses icmp, and
2244349cc55cSDimitry Andric   // so is not tail predicated as per the condition above. In order to get the
2245349cc55cSDimitry Andric   // same performance we treat min and max the same as an icmp for tailpred
2246349cc55cSDimitry Andric   // purposes for the moment (we often rely on non-tailpred and higher VFs to
2247349cc55cSDimitry Andric   // pick more optimal instructions like VQDMULH. They need to be recognized
2248349cc55cSDimitry Andric   // directly by the vectorizer).
2249349cc55cSDimitry Andric   if (auto *II = dyn_cast<IntrinsicInst>(&I))
2250349cc55cSDimitry Andric     if ((II->getIntrinsicID() == Intrinsic::smin ||
2251349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::smax ||
2252349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::umin ||
2253349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::umax) &&
2254349cc55cSDimitry Andric         ++ICmpCount > 1)
2255349cc55cSDimitry Andric       return false;
2256480093f4SDimitry Andric 
2257480093f4SDimitry Andric   if (isa<FCmpInst>(&I))
2258480093f4SDimitry Andric     return false;
2259480093f4SDimitry Andric 
2260480093f4SDimitry Andric   // We could allow extending/narrowing FP loads/stores, but codegen is
2261480093f4SDimitry Andric   // too inefficient so reject this for now.
2262480093f4SDimitry Andric   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2263480093f4SDimitry Andric     return false;
2264480093f4SDimitry Andric 
2265480093f4SDimitry Andric   // Extends have to be extending-loads
2266480093f4SDimitry Andric   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2267480093f4SDimitry Andric     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2268480093f4SDimitry Andric       return false;
2269480093f4SDimitry Andric 
2270480093f4SDimitry Andric   // Truncs have to be narrowing-stores
2271480093f4SDimitry Andric   if (isa<TruncInst>(&I) )
2272480093f4SDimitry Andric     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2273480093f4SDimitry Andric       return false;
2274480093f4SDimitry Andric 
2275480093f4SDimitry Andric   return true;
2276480093f4SDimitry Andric }
2277480093f4SDimitry Andric 
2278480093f4SDimitry Andric // To set up a tail-predicated loop, we need to know the total number of
2279480093f4SDimitry Andric // elements processed by that loop. Thus, we need to determine the element
2280480093f4SDimitry Andric // size and:
2281480093f4SDimitry Andric // 1) it should be uniform for all operations in the vector loop, so we
2282480093f4SDimitry Andric //    e.g. don't want any widening/narrowing operations.
2283480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations
2284480093f4SDimitry Andric //    that work on i64s.
2285480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the
2286480093f4SDimitry Andric //    tail-predication masks/predicates the right lanes.
2287480093f4SDimitry Andric //
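// As an illustration of rule 2), any instruction producing i64 or double
// elements fails the getScalarSizeInBits() > 32 check below and blocks
// tail-predication.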
2288480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2289480093f4SDimitry Andric                                  const DataLayout &DL,
2290480093f4SDimitry Andric                                  const LoopAccessInfo *LAI) {
22915ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
22925ffd83dbSDimitry Andric 
2293e8d8bef9SDimitry Andric   // If there are live-out values, it is probably a reduction. We can predicate
2294e8d8bef9SDimitry Andric   // most reduction operations freely under MVE using a combination of
2295e8d8bef9SDimitry Andric   // prefer-predicated-reduction-select and inloop reductions. We limit this to
2296e8d8bef9SDimitry Andric   // floating point and integer reductions, but don't check for operators
2297e8d8bef9SDimitry Andric   // specifically here. If the value ends up not being a reduction (and so the
2298e8d8bef9SDimitry Andric   // vectorizer cannot tailfold the loop), we should fall back to standard
2299e8d8bef9SDimitry Andric   // vectorization automatically.
23005ffd83dbSDimitry Andric   SmallVector<Instruction *, 8> LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2302e8d8bef9SDimitry Andric   bool ReductionsDisabled =
23035ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::EnabledNoReductions ||
23045ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
23055ffd83dbSDimitry Andric 
23065ffd83dbSDimitry Andric   for (auto *I : LiveOuts) {
2307e8d8bef9SDimitry Andric     if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2308e8d8bef9SDimitry Andric         !I->getType()->isHalfTy()) {
2309e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
23105ffd83dbSDimitry Andric                            "live-out value\n");
23115ffd83dbSDimitry Andric       return false;
23125ffd83dbSDimitry Andric     }
2313e8d8bef9SDimitry Andric     if (ReductionsDisabled) {
2314e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
23155ffd83dbSDimitry Andric       return false;
23165ffd83dbSDimitry Andric     }
23175ffd83dbSDimitry Andric   }
23185ffd83dbSDimitry Andric 
23195ffd83dbSDimitry Andric   // Next, check that all instructions can be tail-predicated.
2320480093f4SDimitry Andric   PredicatedScalarEvolution PSE = LAI->getPSE();
23215ffd83dbSDimitry Andric   SmallVector<Instruction *, 16> LoadStores;
2322480093f4SDimitry Andric   int ICmpCount = 0;
2323480093f4SDimitry Andric 
2324480093f4SDimitry Andric   for (BasicBlock *BB : L->blocks()) {
2325480093f4SDimitry Andric     for (Instruction &I : BB->instructionsWithoutDebug()) {
2326480093f4SDimitry Andric       if (isa<PHINode>(&I))
2327480093f4SDimitry Andric         continue;
2328480093f4SDimitry Andric       if (!canTailPredicateInstruction(I, ICmpCount)) {
2329480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2330480093f4SDimitry Andric         return false;
2331480093f4SDimitry Andric       }
2332480093f4SDimitry Andric 
2333480093f4SDimitry Andric       Type *T  = I.getType();
2334480093f4SDimitry Andric       if (T->getScalarSizeInBits() > 32) {
2335480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2336480093f4SDimitry Andric         return false;
2337480093f4SDimitry Andric       }
2338480093f4SDimitry Andric       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2339349cc55cSDimitry Andric         Value *Ptr = getLoadStorePointerOperand(&I);
2340349cc55cSDimitry Andric         Type *AccessTy = getLoadStoreType(&I);
2341bdd1243dSDimitry Andric         int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2342e8d8bef9SDimitry Andric         if (NextStride == 1) {
2343480093f4SDimitry Andric           // TODO: for now only allow consecutive strides of 1. We could support
2344e8d8bef9SDimitry Andric           // other strides as long as they are uniform, but let's keep it simple
2345e8d8bef9SDimitry Andric           // for now.
2346e8d8bef9SDimitry Andric           continue;
2347e8d8bef9SDimitry Andric         } else if (NextStride == -1 ||
2348e8d8bef9SDimitry Andric                    (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2349e8d8bef9SDimitry Andric                    (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2350e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs()
2351e8d8bef9SDimitry Andric                      << "Consecutive strides of 2 found, vld2/vst2 can't "
2352e8d8bef9SDimitry Andric                         "be tail-predicated.\n");
2353e8d8bef9SDimitry Andric           return false;
2354e8d8bef9SDimitry Andric           // TODO: don't tail predicate if there is a reversed load?
2355e8d8bef9SDimitry Andric         } else if (EnableMaskedGatherScatters) {
2356e8d8bef9SDimitry Andric           // Gather/scatters do allow loading from arbitrary strides, at
2357e8d8bef9SDimitry Andric           // least if they are loop invariant.
2358e8d8bef9SDimitry Andric           // TODO: Loop variant strides should in theory work, too, but
2359e8d8bef9SDimitry Andric           // this requires further testing.
2360349cc55cSDimitry Andric           const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2361e8d8bef9SDimitry Andric           if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2362e8d8bef9SDimitry Andric             const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2363e8d8bef9SDimitry Andric             if (PSE.getSE()->isLoopInvariant(Step, L))
2364480093f4SDimitry Andric               continue;
2365480093f4SDimitry Andric           }
2366e8d8bef9SDimitry Andric         }
2367e8d8bef9SDimitry Andric         LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2368480093f4SDimitry Andric                              "tail-predicate.\n");
2369480093f4SDimitry Andric         return false;
2370480093f4SDimitry Andric       }
2371480093f4SDimitry Andric     }
2372480093f4SDimitry Andric   }
2373480093f4SDimitry Andric 
2374480093f4SDimitry Andric   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2375480093f4SDimitry Andric   return true;
2376480093f4SDimitry Andric }
2377480093f4SDimitry Andric 
237806c3fb27SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
23795ffd83dbSDimitry Andric   if (!EnableTailPredication) {
23805ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2381480093f4SDimitry Andric     return false;
23825ffd83dbSDimitry Andric   }
2383480093f4SDimitry Andric 
2384480093f4SDimitry Andric   // Creating a predicated vector loop is the first step for generating a
2385480093f4SDimitry Andric   // tail-predicated hardware loop, for which we need the MVE masked
2386480093f4SDimitry Andric   // load/stores instructions:
2387480093f4SDimitry Andric   if (!ST->hasMVEIntegerOps())
2388480093f4SDimitry Andric     return false;
2389480093f4SDimitry Andric 
239006c3fb27SDimitry Andric   LoopVectorizationLegality *LVL = TFI->LVL;
239106c3fb27SDimitry Andric   Loop *L = LVL->getLoop();
239206c3fb27SDimitry Andric 
2393480093f4SDimitry Andric   // For now, restrict this to single block loops.
2394480093f4SDimitry Andric   if (L->getNumBlocks() > 1) {
2395480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2396480093f4SDimitry Andric                          "loop.\n");
2397480093f4SDimitry Andric     return false;
2398480093f4SDimitry Andric   }
2399480093f4SDimitry Andric 
2400e8d8bef9SDimitry Andric   assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2401480093f4SDimitry Andric 
240206c3fb27SDimitry Andric   LoopInfo *LI = LVL->getLoopInfo();
2403480093f4SDimitry Andric   HardwareLoopInfo HWLoopInfo(L);
2404480093f4SDimitry Andric   if (!HWLoopInfo.canAnalyze(*LI)) {
2405480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2406480093f4SDimitry Andric                          "analyzable.\n");
2407480093f4SDimitry Andric     return false;
2408480093f4SDimitry Andric   }
2409480093f4SDimitry Andric 
241006c3fb27SDimitry Andric   AssumptionCache *AC = LVL->getAssumptionCache();
241106c3fb27SDimitry Andric   ScalarEvolution *SE = LVL->getScalarEvolution();
241206c3fb27SDimitry Andric 
2413480093f4SDimitry Andric   // This checks if we have the low-overhead branch architecture
2414480093f4SDimitry Andric   // extension, and if we will create a hardware-loop:
241506c3fb27SDimitry Andric   if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2416480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2417480093f4SDimitry Andric                          "profitable.\n");
2418480093f4SDimitry Andric     return false;
2419480093f4SDimitry Andric   }
2420480093f4SDimitry Andric 
242106c3fb27SDimitry Andric   DominatorTree *DT = LVL->getDominatorTree();
242206c3fb27SDimitry Andric   if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2423480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2424480093f4SDimitry Andric                          "a candidate.\n");
2425480093f4SDimitry Andric     return false;
2426480093f4SDimitry Andric   }
2427480093f4SDimitry Andric 
242806c3fb27SDimitry Andric   return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2429480093f4SDimitry Andric }
2430480093f4SDimitry Andric 
243106c3fb27SDimitry Andric TailFoldingStyle
243206c3fb27SDimitry Andric ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
24335ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
243406c3fb27SDimitry Andric     return TailFoldingStyle::DataWithoutLaneMask;
2435480093f4SDimitry Andric 
24365ffd83dbSDimitry Andric   // Intrinsic @llvm.get.active.lane.mask is supported.
24375ffd83dbSDimitry Andric   // It is used in the MVETailPredication pass, which requires the number of
24385ffd83dbSDimitry Andric   // elements processed by this vector loop to setup the tail-predicated
24395ffd83dbSDimitry Andric   // loop.
244006c3fb27SDimitry Andric   return TailFoldingStyle::Data;
24415ffd83dbSDimitry Andric }
24420b57cec5SDimitry Andric void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2443349cc55cSDimitry Andric                                          TTI::UnrollingPreferences &UP,
2444349cc55cSDimitry Andric                                          OptimizationRemarkEmitter *ORE) {
24455f757f3fSDimitry Andric   // Enable upper bound unrolling universally, provided that we do not see an
24465f757f3fSDimitry Andric   // active lane mask, which is better kept as a loop to become tail
24475f757f3fSDimitry Andric   // predicated than to be conditionally unrolled.
24485f757f3fSDimitry Andric   UP.UpperBound =
24495f757f3fSDimitry Andric       !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
24505f757f3fSDimitry Andric         return isa<IntrinsicInst>(I) &&
24515f757f3fSDimitry Andric                cast<IntrinsicInst>(I).getIntrinsicID() ==
24525f757f3fSDimitry Andric                    Intrinsic::get_active_lane_mask;
24535f757f3fSDimitry Andric       });
2454fe6060f1SDimitry Andric 
24550b57cec5SDimitry Andric   // Only currently enable these preferences for M-Class cores.
24560b57cec5SDimitry Andric   if (!ST->isMClass())
2457349cc55cSDimitry Andric     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
24580b57cec5SDimitry Andric 
24590b57cec5SDimitry Andric   // Disable loop unrolling for Oz and Os.
24600b57cec5SDimitry Andric   UP.OptSizeThreshold = 0;
24610b57cec5SDimitry Andric   UP.PartialOptSizeThreshold = 0;
24620b57cec5SDimitry Andric   if (L->getHeader()->getParent()->hasOptSize())
24630b57cec5SDimitry Andric     return;
24640b57cec5SDimitry Andric 
24650b57cec5SDimitry Andric   SmallVector<BasicBlock*, 4> ExitingBlocks;
24660b57cec5SDimitry Andric   L->getExitingBlocks(ExitingBlocks);
24670b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Loop has:\n"
24680b57cec5SDimitry Andric                     << "Blocks: " << L->getNumBlocks() << "\n"
24690b57cec5SDimitry Andric                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
24700b57cec5SDimitry Andric 
24710b57cec5SDimitry Andric   // Allow at most one exit other than the latch. This acts as an early exit,
24720b57cec5SDimitry Andric   // mirroring the profitability calculation of the runtime unroller.
24730b57cec5SDimitry Andric   if (ExitingBlocks.size() > 2)
24740b57cec5SDimitry Andric     return;
24750b57cec5SDimitry Andric 
24760b57cec5SDimitry Andric   // Limit the CFG of the loop body for targets with a branch predictor.
24770b57cec5SDimitry Andric   // Allowing 4 blocks permits if-then-else diamonds in the body.
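  // For example, a body such as
  //   if (c) x = a; else x = b;
  // forms a four-block diamond (header, then, else, latch) and so is still
  // considered here.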
24780b57cec5SDimitry Andric   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
24790b57cec5SDimitry Andric     return;
24800b57cec5SDimitry Andric 
2481e8d8bef9SDimitry Andric   // Don't unroll vectorized loops, including the remainder loop.
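  // The flag is loop metadata left behind by the vectorizer, e.g.:
  //   br i1 %c, label %header, label %exit, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.isvectorized", i32 1}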
2482e8d8bef9SDimitry Andric   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2483e8d8bef9SDimitry Andric     return;
2484e8d8bef9SDimitry Andric 
24850b57cec5SDimitry Andric   // Scan the loop: don't unroll loops with calls as this could prevent
24860b57cec5SDimitry Andric   // inlining.
2487fe6060f1SDimitry Andric   InstructionCost Cost = 0;
24880b57cec5SDimitry Andric   for (auto *BB : L->getBlocks()) {
24890b57cec5SDimitry Andric     for (auto &I : *BB) {
2490480093f4SDimitry Andric       // Don't unroll vectorised loops. MVE does not benefit from unrolling as
2491480093f4SDimitry Andric       // much as scalar code does.
2492480093f4SDimitry Andric       if (I.getType()->isVectorTy())
2493480093f4SDimitry Andric         return;
2494480093f4SDimitry Andric 
24950b57cec5SDimitry Andric       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
24965ffd83dbSDimitry Andric         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
24970b57cec5SDimitry Andric           if (!isLoweredToCall(F))
24980b57cec5SDimitry Andric             continue;
24990b57cec5SDimitry Andric         }
25000b57cec5SDimitry Andric         return;
25010b57cec5SDimitry Andric       }
25028bcb0991SDimitry Andric 
2503e8d8bef9SDimitry Andric       SmallVector<const Value*, 4> Operands(I.operand_values());
2504bdd1243dSDimitry Andric       Cost += getInstructionCost(&I, Operands,
2505bdd1243dSDimitry Andric                                  TargetTransformInfo::TCK_SizeAndLatency);
25060b57cec5SDimitry Andric     }
25070b57cec5SDimitry Andric   }
25080b57cec5SDimitry Andric 
2509fe6060f1SDimitry Andric   // On v6m cores, there are very few registers available. We can easily end up
2510fe6060f1SDimitry Andric   // spilling and reloading more registers in an unrolled loop. Look at the
2511fe6060f1SDimitry Andric   // number of LCSSA phis as a rough measure of how many registers will need to
2512fe6060f1SDimitry Andric   // be live out of the loop, reducing the default unroll count if more than 1
2513fe6060f1SDimitry Andric   // value is needed.  In the long run, all of this should be being learnt by a
2514fe6060f1SDimitry Andric   // value is needed. In the long run, all of this should be learnt by a
2515fe6060f1SDimitry Andric   // machine.
2516fe6060f1SDimitry Andric   if (ST->isThumb1Only()) {
2517fe6060f1SDimitry Andric     unsigned ExitingValues = 0;
2518fe6060f1SDimitry Andric     SmallVector<BasicBlock *, 4> ExitBlocks;
2519fe6060f1SDimitry Andric     L->getExitBlocks(ExitBlocks);
2520fe6060f1SDimitry Andric     for (auto *Exit : ExitBlocks) {
2521fe6060f1SDimitry Andric       // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2522fe6060f1SDimitry Andric       // only the last is expected to be needed for address operands.
2523fe6060f1SDimitry Andric       unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2524fe6060f1SDimitry Andric         return PH.getNumOperands() != 1 ||
2525fe6060f1SDimitry Andric                !isa<GetElementPtrInst>(PH.getOperand(0));
2526fe6060f1SDimitry Andric       });
2527fe6060f1SDimitry Andric       ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2528fe6060f1SDimitry Andric     }
2529fe6060f1SDimitry Andric     if (ExitingValues)
2530fe6060f1SDimitry Andric       UnrollCount /= ExitingValues;
2531fe6060f1SDimitry Andric     if (UnrollCount <= 1)
2532fe6060f1SDimitry Andric       return;
2533fe6060f1SDimitry Andric   }
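
  // Worked example: with the default count of 4, one live-out value keeps
  // the count at 4, two give 4 / 2 == 2, and three or more give 4 / 3 == 1,
  // triggering the early return above and disabling runtime unrolling.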
2534fe6060f1SDimitry Andric 
25350b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2536fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
25370b57cec5SDimitry Andric 
25380b57cec5SDimitry Andric   UP.Partial = true;
25390b57cec5SDimitry Andric   UP.Runtime = true;
25400b57cec5SDimitry Andric   UP.UnrollRemainder = true;
2541fe6060f1SDimitry Andric   UP.DefaultUnrollRuntimeCount = UnrollCount;
25420b57cec5SDimitry Andric   UP.UnrollAndJam = true;
25430b57cec5SDimitry Andric   UP.UnrollAndJamInnerLoopThreshold = 60;
25440b57cec5SDimitry Andric 
25450b57cec5SDimitry Andric   // Force-unrolling small loops can be very useful because of the
25460b57cec5SDimitry Andric   // branch-taken cost of the backedge.
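  // For instance (an illustrative sketch), a reduction loop such as
  //   for (i = 0; i < n; i++) sum += a[i];
  // has a body cost well under 12, so unrolling is forced to amortise the
  // backedge branch over several iterations.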
25470b57cec5SDimitry Andric   if (Cost < 12)
25480b57cec5SDimitry Andric     UP.Force = true;
25490b57cec5SDimitry Andric }
25508bcb0991SDimitry Andric 
25515ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
25525ffd83dbSDimitry Andric                                        TTI::PeelingPreferences &PP) {
25535ffd83dbSDimitry Andric   BaseT::getPeelingPreferences(L, SE, PP);
25545ffd83dbSDimitry Andric }
25555ffd83dbSDimitry Andric 
2556e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2557e8d8bef9SDimitry Andric                                        TTI::ReductionFlags Flags) const {
2558e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2559e8d8bef9SDimitry Andric     return false;
2560e8d8bef9SDimitry Andric 
2561e8d8bef9SDimitry Andric   unsigned ScalarBits = Ty->getScalarSizeInBits();
2562e8d8bef9SDimitry Andric   switch (Opcode) {
2563e8d8bef9SDimitry Andric   case Instruction::Add:
2564e8d8bef9SDimitry Andric     return ScalarBits <= 64;
2565e8d8bef9SDimitry Andric   default:
2566e8d8bef9SDimitry Andric     return false;
2567e8d8bef9SDimitry Andric   }
2568e8d8bef9SDimitry Andric }
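
// Illustrative IR sketch of the in-loop reductions this prefers (names
// invented): the loop keeps a scalar accumulator and reduces each vector
// partial sum inside the body, which maps naturally onto MVE's VADDV/VADDVA:
//
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
//   %acc.next = add i32 %acc, %r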
2569e8d8bef9SDimitry Andric 
2570e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect(
2571e8d8bef9SDimitry Andric     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2572e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2573e8d8bef9SDimitry Andric     return false;
2574e8d8bef9SDimitry Andric   return true;
2575e8d8bef9SDimitry Andric }
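
// Sketch of the pattern this enables (names invented): keeping the select
// on the loop mask next to the reduction,
//
//   %s = select <4 x i1> %mask, <4 x i32> %red, <4 x i32> %prev
//
// allows the backend to fold the predicate into the reduction itself (e.g.
// a VPT-predicated VADDVT) rather than emitting a separate VPSEL.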
2576bdd1243dSDimitry Andric 
2577bdd1243dSDimitry Andric InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2578*0fca6ea1SDimitry Andric                                                  StackOffset BaseOffset,
2579bdd1243dSDimitry Andric                                                  bool HasBaseReg, int64_t Scale,
2580bdd1243dSDimitry Andric                                                  unsigned AddrSpace) const {
2581bdd1243dSDimitry Andric   TargetLoweringBase::AddrMode AM;
2582bdd1243dSDimitry Andric   AM.BaseGV = BaseGV;
2583*0fca6ea1SDimitry Andric   AM.BaseOffs = BaseOffset.getFixed();
2584bdd1243dSDimitry Andric   AM.HasBaseReg = HasBaseReg;
2585bdd1243dSDimitry Andric   AM.Scale = Scale;
2586*0fca6ea1SDimitry Andric   AM.ScalableOffset = BaseOffset.getScalable();
2587bdd1243dSDimitry Andric   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2588bdd1243dSDimitry Andric     if (ST->hasFPAO())
2589bdd1243dSDimitry Andric       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2590bdd1243dSDimitry Andric     return 0;
2591bdd1243dSDimitry Andric   }
2592bdd1243dSDimitry Andric   return -1;
2593bdd1243dSDimitry Andric }
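
// Worked example: where a scaled mode such as [r1, r2, lsl #2] is a legal
// address, the cost is 0 (or 1 on cores with FPAO when the scale is
// negative, since positive offsets execute faster); an illegal combination
// returns -1 so that LSR steers away from it.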
259406c3fb27SDimitry Andric 
259506c3fb27SDimitry Andric bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
259606c3fb27SDimitry Andric   if (Thumb) {
259706c3fb27SDimitry Andric     // B.W is available in any Thumb2-supporting target, and also in every
259806c3fb27SDimitry Andric     // version of Armv8-M, even Baseline which does not include the rest of
259906c3fb27SDimitry Andric     // Thumb2.
260006c3fb27SDimitry Andric     return ST->isThumb2() || ST->hasV8MBaselineOps();
260106c3fb27SDimitry Andric   } else {
260206c3fb27SDimitry Andric     // B is available in all versions of the Arm ISA, so the only question is
260306c3fb27SDimitry Andric     // whether that ISA is available at all.
260406c3fb27SDimitry Andric     return ST->hasARMOps();
260506c3fb27SDimitry Andric   }
260606c3fb27SDimitry Andric }
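
// For reference (architectural ranges, not derived from this file): Thumb2
// B.W reaches roughly +/-16MB, the 16-bit Thumb B only about +/-2KB, and
// Arm-state B roughly +/-32MB.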
2607