xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/MC/SubtargetFeature.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
31 #include "llvm/Support/MachineValueType.h"
32 #include "llvm/Target/TargetMachine.h"
33 #include "llvm/Transforms/InstCombine/InstCombiner.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
45 static cl::opt<bool> EnableMaskedLoadStores(
46   "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47   cl::desc("Enable the generation of masked loads and stores"));
48 
49 static cl::opt<bool> DisableLowOverheadLoops(
50   "disable-arm-loloops", cl::Hidden, cl::init(false),
51   cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54     AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55                   cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67                                InstCombiner::BuilderTy &Builder) {
68   auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70   if (!IntrAlign)
71     return nullptr;
72 
73   unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74                            ? MemAlign
75                            : IntrAlign->getLimitedValue();
76 
77   if (!isPowerOf2_32(Alignment))
78     return nullptr;
79 
80   auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81                                           PointerType::get(II.getType(), 0));
82   return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
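// Illustrative sketch of the fold above (IR spelling approximate, assuming
// typed pointers): with %p known to be 16-byte aligned,
//   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 1)
// becomes
//   %c = bitcast i8* %p to <4 x i32>*
//   %v = load <4 x i32>, <4 x i32>* %c, align 16
// which later passes can constant-fold when %p addresses a constant.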
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86                                      const Function *Callee) const {
87   const TargetMachine &TM = getTLI()->getTargetMachine();
88   const FeatureBitset &CallerBits =
89       TM.getSubtargetImpl(*Caller)->getFeatureBits();
90   const FeatureBitset &CalleeBits =
91       TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93   // To inline a callee, all features not in the allowed list must match exactly.
94   bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95                     (CalleeBits & ~InlineFeaturesAllowed);
96   // For features in the allowed list, the callee's features must be a subset of
97   // the caller's.
98   bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99                      (CalleeBits & InlineFeaturesAllowed);
100   return MatchExact && MatchSubset;
101 }
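// For example (feature names abstracted): a callee built with a feature
// outside InlineFeaturesAllowed that the caller lacks fails MatchExact and is
// not inlined, while a callee whose allowed-list features are a subset of the
// caller's still satisfies MatchSubset and remains inlinable.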
102 
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105                                        ScalarEvolution *SE) const {
106   if (ST->hasMVEIntegerOps())
107     return TTI::AMK_PostIndexed;
108 
109   if (L->getHeader()->getParent()->hasOptSize())
110     return TTI::AMK_None;
111 
112   if (ST->isMClass() && ST->isThumb2() &&
113       L->getNumBlocks() == 1)
114     return TTI::AMK_PreIndexed;
115 
116   return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121   using namespace PatternMatch;
122   Intrinsic::ID IID = II.getIntrinsicID();
123   switch (IID) {
124   default:
125     break;
126   case Intrinsic::arm_neon_vld1: {
127     Align MemAlign =
128         getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129                           &IC.getAssumptionCache(), &IC.getDominatorTree());
130     if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131       return IC.replaceInstUsesWith(II, V);
132     }
133     break;
134   }
135 
136   case Intrinsic::arm_neon_vld2:
137   case Intrinsic::arm_neon_vld3:
138   case Intrinsic::arm_neon_vld4:
139   case Intrinsic::arm_neon_vld2lane:
140   case Intrinsic::arm_neon_vld3lane:
141   case Intrinsic::arm_neon_vld4lane:
142   case Intrinsic::arm_neon_vst1:
143   case Intrinsic::arm_neon_vst2:
144   case Intrinsic::arm_neon_vst3:
145   case Intrinsic::arm_neon_vst4:
146   case Intrinsic::arm_neon_vst2lane:
147   case Intrinsic::arm_neon_vst3lane:
148   case Intrinsic::arm_neon_vst4lane: {
149     Align MemAlign =
150         getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151                           &IC.getAssumptionCache(), &IC.getDominatorTree());
152     unsigned AlignArg = II.arg_size() - 1;
153     Value *AlignArgOp = II.getArgOperand(AlignArg);
154     MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155     if (Align && *Align < MemAlign) {
156       return IC.replaceOperand(
157           II, AlignArg,
158           ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159                            false));
160     }
161     break;
162   }
163 
164   case Intrinsic::arm_mve_pred_i2v: {
165     Value *Arg = II.getArgOperand(0);
166     Value *ArgArg;
167     if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168                        PatternMatch::m_Value(ArgArg))) &&
169         II.getType() == ArgArg->getType()) {
170       return IC.replaceInstUsesWith(II, ArgArg);
171     }
172     Constant *XorMask;
173     if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174                              PatternMatch::m_Value(ArgArg)),
175                          PatternMatch::m_Constant(XorMask))) &&
176         II.getType() == ArgArg->getType()) {
177       if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178         if (CI->getValue().trunc(16).isAllOnes()) {
179           auto TrueVector = IC.Builder.CreateVectorSplat(
180               cast<FixedVectorType>(II.getType())->getNumElements(),
181               IC.Builder.getTrue());
182           return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183         }
184       }
185     }
186     KnownBits ScalarKnown(32);
187     if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188                                 ScalarKnown, 0)) {
189       return &II;
190     }
191     break;
192   }
193   case Intrinsic::arm_mve_pred_v2i: {
194     Value *Arg = II.getArgOperand(0);
195     Value *ArgArg;
196     if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197                        PatternMatch::m_Value(ArgArg)))) {
198       return IC.replaceInstUsesWith(II, ArgArg);
199     }
200     if (!II.getMetadata(LLVMContext::MD_range)) {
201       Type *IntTy32 = Type::getInt32Ty(II.getContext());
202       Metadata *M[] = {
203           ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204           ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205       II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206       return &II;
207     }
208     break;
209   }
210   case Intrinsic::arm_mve_vadc:
211   case Intrinsic::arm_mve_vadc_predicated: {
212     unsigned CarryOp =
213         (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214     assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215            "Bad type for intrinsic!");
216 
217     KnownBits CarryKnown(32);
218     if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219                                 CarryKnown)) {
220       return &II;
221     }
222     break;
223   }
224   case Intrinsic::arm_mve_vmldava: {
225     Instruction *I = cast<Instruction>(&II);
226     if (I->hasOneUse()) {
227       auto *User = cast<Instruction>(*I->user_begin());
228       Value *OpZ;
229       if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230           match(I->getOperand(3), m_Zero())) {
231         Value *OpX = I->getOperand(4);
232         Value *OpY = I->getOperand(5);
233         Type *OpTy = OpX->getType();
234 
235         IC.Builder.SetInsertPoint(User);
236         Value *V =
237             IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238                                        {I->getOperand(0), I->getOperand(1),
239                                         I->getOperand(2), OpZ, OpX, OpY});
240 
241         IC.replaceInstUsesWith(*User, V);
242         return IC.eraseInstFromFunction(*User);
243       }
244     }
245     return None;
246   }
247   }
248   return None;
249 }
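// Sketch of one fold handled above (intrinsic names spelled approximately):
// a predicate converted to an integer and straight back,
//   %w = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %m)
//   %p = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %w)
// allows %p to be replaced by %m directly, and both calls can then be
// removed if they have no other uses.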
250 
251 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
252     InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253     APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254     std::function<void(Instruction *, unsigned, APInt, APInt &)>
255         SimplifyAndSetOp) const {
256 
257   // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
258   // the index of the operand that selects a Top/Bottom instruction, which can
259   // change between intrinsics.
260   auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
261     unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262     unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263 
264     // Only the odd/even lanes of operand 0 will be demanded, depending
265     // on whether this is a top/bottom instruction.
266     APInt DemandedElts =
267         APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268                                        : APInt::getHighBitsSet(2, 1));
269     SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270     // The other lanes will be defined from the inserted elements.
271     UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272                                                  : APInt::getHighBitsSet(2, 1));
273     return None;
274   };
275 
276   switch (II.getIntrinsicID()) {
277   default:
278     break;
279   case Intrinsic::arm_mve_vcvt_narrow:
280     SimplifyNarrowInstrTopBottom(2);
281     break;
282   case Intrinsic::arm_mve_vqmovn:
283     SimplifyNarrowInstrTopBottom(4);
284     break;
285   case Intrinsic::arm_mve_vshrn:
286     SimplifyNarrowInstrTopBottom(7);
287     break;
288   }
289 
290   return None;
291 }
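// Worked example of the Top/Bottom handling above: for a bottom-lane VQMOVN
// producing <8 x i16> (IsTop == 0) the even result lanes come from the
// narrowed values, so only the odd lanes of operand 0 are demanded; a
// top-lane variant demands only the even lanes instead.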
292 
293 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
294                                           TTI::TargetCostKind CostKind) {
295   assert(Ty->isIntegerTy());
296 
297   unsigned Bits = Ty->getPrimitiveSizeInBits();
298   if (Bits == 0 || Imm.getActiveBits() >= 64)
299     return 4;
300 
301   int64_t SImmVal = Imm.getSExtValue();
302   uint64_t ZImmVal = Imm.getZExtValue();
303   if (!ST->isThumb()) {
304     if ((SImmVal >= 0 && SImmVal < 65536) ||
305         (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306         (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307       return 1;
308     return ST->hasV6T2Ops() ? 2 : 3;
309   }
310   if (ST->isThumb2()) {
311     if ((SImmVal >= 0 && SImmVal < 65536) ||
312         (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313         (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314       return 1;
315     return ST->hasV6T2Ops() ? 2 : 3;
316   }
317   // Thumb1: any i8 immediate, or any value in [0, 255], costs 1.
318   if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
319     return 1;
320   if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
321     return 2;
322   // Load from constantpool.
323   return 3;
324 }
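// For instance, on an ARM or Thumb2 target: 42 fits a 16-bit mov or an
// SO-immediate and is costed at 1; an arbitrary value such as 0x12345678
// costs 2 with v6t2 (movw/movt) and 3 otherwise; and any immediate with 64
// or more active bits is costed at 4.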
325 
326 // Non-negative constants smaller than 256 fit in the immediate field of
327 // Thumb1 instructions, so they cost 0; everything else costs 1.
328 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
329                                                   const APInt &Imm, Type *Ty) {
330   if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331     return 0;
332 
333   return 1;
334 }
335 
336 // Checks whether Inst is part of a min(max()) or max(min()) pattern
337 // that will match an SSAT instruction.
338 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
339   Value *LHS, *RHS;
340   ConstantInt *C;
341   SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
342 
343   if (InstSPF == SPF_SMAX &&
344       PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
345       C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
346 
347     auto isSSatMin = [&](Value *MinInst) {
348       if (isa<SelectInst>(MinInst)) {
349         Value *MinLHS, *MinRHS;
350         ConstantInt *MinC;
351         SelectPatternFlavor MinSPF =
352             matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
353         if (MinSPF == SPF_SMIN &&
354             PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
355             MinC->getValue() == ((-Imm) - 1))
356           return true;
357       }
358       return false;
359     };
360 
361     if (isSSatMin(Inst->getOperand(1)) ||
362         (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
363                                isSSatMin(*(++Inst->user_begin())))))
364       return true;
365   }
366   return false;
367 }
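// Example of the pattern recognised above: a signed clamp such as
//   smax(smin(x, 127), -128)
// matches when queried with Imm == -128, since -128 is a negated power of
// two and the inner smin constant equals (-Imm) - 1 == 127, so the pair can
// become a single SSAT (saturating to the signed 8-bit range).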
368 
369 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
370                                               const APInt &Imm, Type *Ty,
371                                               TTI::TargetCostKind CostKind,
372                                               Instruction *Inst) {
373   // Division by a constant can be turned into multiplication, but only if we
374   // know it's constant. So it's not so much that the immediate is cheap (it's
375   // not), but that the alternative is worse.
376   // FIXME: this is probably unneeded with GlobalISel.
377   if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
378        Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
379       Idx == 1)
380     return 0;
381 
382   // Leave any gep offsets for CodeGenPrepare, which will do a better job at
383   // splitting any large offsets.
384   if (Opcode == Instruction::GetElementPtr && Idx != 0)
385     return 0;
386 
387   if (Opcode == Instruction::And) {
388     // UXTB/UXTH
389     if (Imm == 255 || Imm == 65535)
390       return 0;
391     // Conversion to BIC is free, and means we can use ~Imm instead.
392     return std::min(getIntImmCost(Imm, Ty, CostKind),
393                     getIntImmCost(~Imm, Ty, CostKind));
394   }
395 
396   if (Opcode == Instruction::Add)
397     // Conversion to SUB is free, and means we can use -Imm instead.
398     return std::min(getIntImmCost(Imm, Ty, CostKind),
399                     getIntImmCost(-Imm, Ty, CostKind));
400 
401   if (Opcode == Instruction::ICmp && Imm.isNegative() &&
402       Ty->getIntegerBitWidth() == 32) {
403     int64_t NegImm = -Imm.getSExtValue();
404     if (ST->isThumb2() && NegImm < 1<<12)
405       // icmp X, #-C -> cmn X, #C
406       return 0;
407     if (ST->isThumb() && NegImm < 1<<8)
408       // icmp X, #-C -> adds X, #C
409       return 0;
410   }
411 
412   // xor a, -1 can always be folded to MVN
413   if (Opcode == Instruction::Xor && Imm.isAllOnes())
414     return 0;
415 
416   // Ensure the negative constants of min(max()) or max(min()) patterns that
417   // match SSAT instructions don't get hoisted.
418   if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
419       Ty->getIntegerBitWidth() <= 32) {
420     if (isSSATMinMaxPattern(Inst, Imm) ||
421         (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
422          isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
423       return 0;
424   }
425 
426   // We can convert <= -1 to < 0, which is generally quite cheap.
427   if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
428     ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
429     if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
430       return std::min(getIntImmCost(Imm, Ty, CostKind),
431                       getIntImmCost(Imm + 1, Ty, CostKind));
432   }
433 
434   return getIntImmCost(Imm, Ty, CostKind);
435 }
436 
437 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
438                                            TTI::TargetCostKind CostKind,
439                                            const Instruction *I) {
440   if (CostKind == TTI::TCK_RecipThroughput &&
441       (ST->hasNEON() || ST->hasMVEIntegerOps())) {
442     // FIXME: The vectorizer is highly sensitive to the cost of these
443     // instructions, which suggests that it may be using the costs incorrectly.
444     // But, for now, just make them free to avoid performance regressions for
445     // vector targets.
446     return 0;
447   }
448   return BaseT::getCFInstrCost(Opcode, CostKind, I);
449 }
450 
451 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
452                                              Type *Src,
453                                              TTI::CastContextHint CCH,
454                                              TTI::TargetCostKind CostKind,
455                                              const Instruction *I) {
456   int ISD = TLI->InstructionOpcodeToISD(Opcode);
457   assert(ISD && "Invalid opcode");
458 
459   // TODO: Allow non-throughput costs that aren't binary.
460   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
461     if (CostKind != TTI::TCK_RecipThroughput)
462       return Cost == 0 ? 0 : 1;
463     return Cost;
464   };
465   auto IsLegalFPType = [this](EVT VT) {
466     EVT EltVT = VT.getScalarType();
467     return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
468             (EltVT == MVT::f64 && ST->hasFP64()) ||
469             (EltVT == MVT::f16 && ST->hasFullFP16());
470   };
471 
472   EVT SrcTy = TLI->getValueType(DL, Src);
473   EVT DstTy = TLI->getValueType(DL, Dst);
474 
475   if (!SrcTy.isSimple() || !DstTy.isSimple())
476     return AdjustCost(
477         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
478 
479   // Extending masked loads and truncating masked stores are expensive because
480   // we currently don't split them. This means that we'll likely end up
481   // loading/storing each element individually (hence the high cost).
482   if ((ST->hasMVEIntegerOps() &&
483        (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
484         Opcode == Instruction::SExt)) ||
485       (ST->hasMVEFloatOps() &&
486        (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
487        IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
488     if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
489       return 2 * DstTy.getVectorNumElements() *
490              ST->getMVEVectorCostFactor(CostKind);
491 
492   // The extend of other kinds of load is free
493   if (CCH == TTI::CastContextHint::Normal ||
494       CCH == TTI::CastContextHint::Masked) {
495     static const TypeConversionCostTblEntry LoadConversionTbl[] = {
496         {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
497         {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
498         {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
499         {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
500         {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
501         {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
502         {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
503         {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
504         {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
505         {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
506         {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
507         {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
508     };
509     if (const auto *Entry = ConvertCostTableLookup(
510             LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
511       return AdjustCost(Entry->Cost);
512 
513     static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
514         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
515         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
516         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
517         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
518         {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
519         {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
520         // The following entries extend from a legal type to an illegal type, so
521         // the load needs to be split. This introduces an extra load operation,
522         // but the extend is still "free".
523         {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
524         {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
525         {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
526         {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
527         {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
528         {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
529     };
530     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
531       if (const auto *Entry =
532               ConvertCostTableLookup(MVELoadConversionTbl, ISD,
533                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
534         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
535     }
536 
537     static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
538         // FPExtends are similar but also require the VCVT instructions.
539         {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
540         {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
541     };
542     if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
543       if (const auto *Entry =
544               ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
545                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
546         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
547     }
548 
549     // The truncate of a store is free. This is the mirror of extends above.
550     static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
551         {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
552         {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
553         {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
554         {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
555         {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
556         {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
557         {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
558     };
559     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
560       if (const auto *Entry =
561               ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
562                                      SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
563         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
564     }
565 
566     static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
567         {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
568         {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
569     };
570     if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
571       if (const auto *Entry =
572               ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
573                                      SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
574         return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
575     }
576   }
577 
578   // NEON vector operations that can extend their inputs.
579   if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
580       I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
581     static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
582       // vaddl
583       { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
584       { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
585       // vsubl
586       { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
587       { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
588       // vmull
589       { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
590       { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
591       // vshll
592       { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
593       { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
594     };
595 
596     auto *User = cast<Instruction>(*I->user_begin());
597     int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
598     if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
599                                              DstTy.getSimpleVT(),
600                                              SrcTy.getSimpleVT())) {
601       return AdjustCost(Entry->Cost);
602     }
603   }
604 
605   // Single to/from double precision conversions.
606   if (Src->isVectorTy() && ST->hasNEON() &&
607       ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
608         DstTy.getScalarType() == MVT::f32) ||
609        (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
610         DstTy.getScalarType() == MVT::f64))) {
611     static const CostTblEntry NEONFltDblTbl[] = {
612         // Vector fptrunc/fpext conversions.
613         {ISD::FP_ROUND, MVT::v2f64, 2},
614         {ISD::FP_EXTEND, MVT::v2f32, 2},
615         {ISD::FP_EXTEND, MVT::v4f32, 4}};
616 
617     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
618     if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
619       return AdjustCost(LT.first * Entry->Cost);
620   }
621 
622   // Some arithmetic, load and store operations have specific instructions
623   // to cast up/down their types automatically at no extra cost.
624   // TODO: Get these tables to know at least what the related operations are.
625   static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
626     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
627     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
628     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
629     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
630     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
631     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
632 
633     // The number of vmovl instructions for the extension.
634     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
635     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
636     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
637     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
638     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
639     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
640     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
641     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
642     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
643     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
644     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
645     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
646     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
647     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
648     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
649     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
650     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
651     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
652 
653     // Operations that we legalize using splitting.
654     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
655     { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
656 
657     // Vector float <-> i32 conversions.
658     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
659     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
660 
661     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
662     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
663     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
664     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
665     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
666     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
667     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
668     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
669     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
670     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
671     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
672     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
673     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
674     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
675     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
676     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
677     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
678     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
679     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
680     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
681 
682     { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
683     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
684     { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
685     { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
686     { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
687     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
688 
689     // Vector double <-> i32 conversions.
690     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
691     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
692 
693     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
694     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
695     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
696     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
697     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
698     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
699 
700     { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
701     { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
702     { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
703     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
704     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
705     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
706   };
707 
708   if (SrcTy.isVector() && ST->hasNEON()) {
709     if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
710                                                    DstTy.getSimpleVT(),
711                                                    SrcTy.getSimpleVT()))
712       return AdjustCost(Entry->Cost);
713   }
714 
715   // Scalar float to integer conversions.
716   static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
717     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
718     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
719     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
720     { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
721     { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
722     { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
723     { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
724     { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
725     { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
726     { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
727     { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
728     { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
729     { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
730     { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
731     { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
732     { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
733     { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
734     { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
735     { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
736     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
737   };
738   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
739     if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
740                                                    DstTy.getSimpleVT(),
741                                                    SrcTy.getSimpleVT()))
742       return AdjustCost(Entry->Cost);
743   }
744 
745   // Scalar integer to float conversions.
746   static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
747     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
748     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
749     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
750     { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
751     { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
752     { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
753     { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
754     { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
755     { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
756     { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
757     { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
758     { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
759     { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
760     { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
761     { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
762     { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
763     { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
764     { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
765     { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
766     { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
767   };
768 
769   if (SrcTy.isInteger() && ST->hasNEON()) {
770     if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
771                                                    ISD, DstTy.getSimpleVT(),
772                                                    SrcTy.getSimpleVT()))
773       return AdjustCost(Entry->Cost);
774   }
775 
776   // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
777   // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
778   // are linearised so take more.
779   static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
780     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
781     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
782     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
783     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
784     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
785     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
786     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
787     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
788     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
789     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
790     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
791     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
792   };
793 
794   if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
795     if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
796                                                    ISD, DstTy.getSimpleVT(),
797                                                    SrcTy.getSimpleVT()))
798       return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
799   }
800 
801   if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
802     // As a general rule, fp converts that were not matched above are scalarized
803     // and cost 1 vcvt for each lane, so long as the instruction is available.
804     // If not, they become a series of function calls.
805     const InstructionCost CallCost =
806         getCallInstrCost(nullptr, Dst, {Src}, CostKind);
807     int Lanes = 1;
808     if (SrcTy.isFixedLengthVector())
809       Lanes = SrcTy.getVectorNumElements();
810 
811     if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
812       return Lanes;
813     else
814       return Lanes * CallCost;
815   }
816 
817   if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
818       SrcTy.isFixedLengthVector()) {
819     // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
820     // expensive, 2 instructions per lane.
821     if ((SrcTy.getScalarType() == MVT::i8 ||
822          SrcTy.getScalarType() == MVT::i16 ||
823          SrcTy.getScalarType() == MVT::i32) &&
824         SrcTy.getSizeInBits() > 128 &&
825         SrcTy.getSizeInBits() > DstTy.getSizeInBits())
826       return SrcTy.getVectorNumElements() * 2;
827   }
828 
829   // Scalar integer conversion costs.
830   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
831     // i16 -> i64 requires two dependent operations.
832     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
833 
834     // Truncates on i64 are assumed to be free.
835     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
836     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
837     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
838     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
839   };
840 
841   if (SrcTy.isInteger()) {
842     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
843                                                    DstTy.getSimpleVT(),
844                                                    SrcTy.getSimpleVT()))
845       return AdjustCost(Entry->Cost);
846   }
847 
848   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
849                      ? ST->getMVEVectorCostFactor(CostKind)
850                      : 1;
851   return AdjustCost(
852       BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
853 }
854 
855 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
856                                                unsigned Index) {
857   // Penalize inserting into a D-subregister. We end up with a three times
858   // lower estimated throughput on Swift.
859   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
860       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
861     return 3;
862 
863   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
864                         Opcode == Instruction::ExtractElement)) {
865     // Cross-class copies are expensive on many microarchitectures,
866     // so assume they are expensive by default.
867     if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
868       return 3;
869 
870     // Even if it's not a cross-class copy, this likely leads to mixing
871     // of NEON and VFP code and should therefore be penalized.
872     if (ValTy->isVectorTy() &&
873         ValTy->getScalarSizeInBits() <= 32)
874       return std::max<InstructionCost>(
875           BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
876   }
877 
878   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
879                                  Opcode == Instruction::ExtractElement)) {
880     // Integer cross-lane moves are more expensive than float, which can
881     // sometimes just be vmovs. Integer moves involve a transfer through the GPR
882     // registers, causing more of a delay.
883     std::pair<InstructionCost, MVT> LT =
884         getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
885     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
886   }
887 
888   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
889 }
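// For example, on MVE extracting an i32 lane has to go through a GPR (a
// vmov from the vector lane to a core register) and is costed at 4, while a
// float lane extract can stay in the FP/vector register file and is costed
// at 1, both scaled by the legalization count of the scalar type.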
890 
891 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
892                                                Type *CondTy,
893                                                CmpInst::Predicate VecPred,
894                                                TTI::TargetCostKind CostKind,
895                                                const Instruction *I) {
896   int ISD = TLI->InstructionOpcodeToISD(Opcode);
897 
898   // Thumb scalar code size cost for select.
899   if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
900       ST->isThumb() && !ValTy->isVectorTy()) {
901     // Assume expensive structs.
902     if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
903       return TTI::TCC_Expensive;
904 
905     // Select costs can vary because they:
906     // - may require one or more conditional mov (including an IT),
907     // - can't operate directly on immediates,
908     // - require live flags, which we can't copy around easily.
909     InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
910 
911     // Possible IT instruction for Thumb2, or more for Thumb1.
912     ++Cost;
913 
914     // i1 values may need rematerialising by using mov immediates and/or
915     // flag setting instructions.
916     if (ValTy->isIntegerTy(1))
917       ++Cost;
918 
919     return Cost;
920   }
921 
922   // If this is a vector min/max/abs, use the cost of that intrinsic directly
923   // instead. Hopefully when min/max intrinsics are more prevalent this code
924   // will not be needed.
925   const Instruction *Sel = I;
926   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
927       Sel->hasOneUse())
928     Sel = cast<Instruction>(Sel->user_back());
929   if (Sel && ValTy->isVectorTy() &&
930       (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
931     const Value *LHS, *RHS;
932     SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
933     unsigned IID = 0;
934     switch (SPF) {
935     case SPF_ABS:
936       IID = Intrinsic::abs;
937       break;
938     case SPF_SMIN:
939       IID = Intrinsic::smin;
940       break;
941     case SPF_SMAX:
942       IID = Intrinsic::smax;
943       break;
944     case SPF_UMIN:
945       IID = Intrinsic::umin;
946       break;
947     case SPF_UMAX:
948       IID = Intrinsic::umax;
949       break;
950     case SPF_FMINNUM:
951       IID = Intrinsic::minnum;
952       break;
953     case SPF_FMAXNUM:
954       IID = Intrinsic::maxnum;
955       break;
956     default:
957       break;
958     }
959     if (IID) {
960       // The ICmp is free, the select gets the cost of the min/max/etc
961       if (Sel != I)
962         return 0;
963       IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
964       return getIntrinsicInstrCost(CostAttrs, CostKind);
965     }
966   }
967 
968   // On NEON a vector select gets lowered to vbsl.
969   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
970     // Lowering of some vector selects is currently far from perfect.
971     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
972       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
973       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
974       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
975     };
976 
977     EVT SelCondTy = TLI->getValueType(DL, CondTy);
978     EVT SelValTy = TLI->getValueType(DL, ValTy);
979     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
980       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
981                                                      SelCondTy.getSimpleVT(),
982                                                      SelValTy.getSimpleVT()))
983         return Entry->Cost;
984     }
985 
986     std::pair<InstructionCost, MVT> LT =
987         TLI->getTypeLegalizationCost(DL, ValTy);
988     return LT.first;
989   }
990 
991   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
992       (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
993       cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
994     FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
995     FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
996     if (!VecCondTy)
997       VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
998 
999     // If we don't have mve.fp, any fp operations will need to be scalarized.
1000     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1001       // One scalarization insert, one scalarization extract and the cost of the
1002       // fcmps.
1003       return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1004              BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1005              VecValTy->getNumElements() *
1006                  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1007                                     VecCondTy->getScalarType(), VecPred, CostKind,
1008                                     I);
1009     }
1010 
1011     std::pair<InstructionCost, MVT> LT =
1012         TLI->getTypeLegalizationCost(DL, ValTy);
1013     int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1014     // There are two types - the input that specifies the type of the compare
1015     // and the output vXi1 type. Because we don't know how the output will be
1016     // split, we may need an expensive shuffle to get the two in sync. This has
1017     // the effect of making larger-than-legal compares (v8i32 for example)
1018     // expensive.
1019     if (LT.second.getVectorNumElements() > 2) {
1020       if (LT.first > 1)
1021         return LT.first * BaseCost +
1022                BaseT::getScalarizationOverhead(VecCondTy, true, false);
1023       return BaseCost;
1024     }
1025   }
1026 
1027   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1028   // for "multiple beats" potentially needed by MVE instructions.
1029   int BaseCost = 1;
1030   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1031     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1032 
1033   return BaseCost *
1034          BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1035 }
1036 
1037 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1038                                                       ScalarEvolution *SE,
1039                                                       const SCEV *Ptr) {
1040   // Address computations in vectorized code with non-consecutive addresses will
1041   // likely result in more instructions compared to scalar code where the
1042   // computation can more often be merged into the index mode. The resulting
1043   // extra micro-ops can significantly decrease throughput.
1044   unsigned NumVectorInstToHideOverhead = 10;
1045   int MaxMergeDistance = 64;
1046 
1047   if (ST->hasNEON()) {
1048     if (Ty->isVectorTy() && SE &&
1049         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1050       return NumVectorInstToHideOverhead;
1051 
1052     // In many cases the address computation is not merged into the instruction
1053     // addressing mode.
1054     return 1;
1055   }
1056   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1057 }
1058 
1059 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1060   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1061     // If a VCTP is part of a chain, it's already profitable and shouldn't be
1062     // optimized, else LSR may block tail-predication.
1063     switch (II->getIntrinsicID()) {
1064     case Intrinsic::arm_mve_vctp8:
1065     case Intrinsic::arm_mve_vctp16:
1066     case Intrinsic::arm_mve_vctp32:
1067     case Intrinsic::arm_mve_vctp64:
1068       return true;
1069     default:
1070       break;
1071     }
1072   }
1073   return false;
1074 }
1075 
1076 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1077   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1078     return false;
1079 
1080   if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1081     // Don't support v2i1 yet.
1082     if (VecTy->getNumElements() == 2)
1083       return false;
1084 
1085     // We don't support extending fp types.
1086     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1087     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1088       return false;
1089   }
1090 
1091   unsigned EltWidth = DataTy->getScalarSizeInBits();
1092   return (EltWidth == 32 && Alignment >= 4) ||
1093          (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1094 }
1095 
1096 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1097   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1098     return false;
1099 
1100   // This method is called in 2 places:
1101   //  - from the vectorizer with a scalar type, in which case we need to get
1102   //  this as good as we can with the limited info we have (and rely on the cost
1103   //  model for the rest).
1104   //  - from the masked intrinsic lowering pass with the actual vector type.
1105   // For MVE, we have a custom lowering pass that will already have custom
1106   // legalised any gathers that we can to MVE intrinsics, and want to expand all
1107   // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1108   // are here, we know we want to expand.
1109   if (isa<VectorType>(Ty))
1110     return false;
1111 
1112   unsigned EltWidth = Ty->getScalarSizeInBits();
1113   return ((EltWidth == 32 && Alignment >= 4) ||
1114           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1115 }
1116 
1117 /// Given a memcpy/memset/memmove instruction, return the number of memory
1118 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1119 /// call is used.
1120 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1121   MemOp MOp;
1122   unsigned DstAddrSpace = ~0u;
1123   unsigned SrcAddrSpace = ~0u;
1124   const Function *F = I->getParent()->getParent();
1125 
1126   if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1127     ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1128     // If 'size' is not a constant, a library call will be generated.
1129     if (!C)
1130       return -1;
1131 
1132     const unsigned Size = C->getValue().getZExtValue();
1133     const Align DstAlign = *MC->getDestAlign();
1134     const Align SrcAlign = *MC->getSourceAlign();
1135 
1136     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1137                       /*IsVolatile*/ false);
1138     DstAddrSpace = MC->getDestAddressSpace();
1139     SrcAddrSpace = MC->getSourceAddressSpace();
1140   }
1141   else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1142     ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1143     // If 'size' is not a constant, a library call will be generated.
1144     if (!C)
1145       return -1;
1146 
1147     const unsigned Size = C->getValue().getZExtValue();
1148     const Align DstAlign = *MS->getDestAlign();
1149 
1150     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1151                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1152     DstAddrSpace = MS->getDestAddressSpace();
1153   }
1154   else
1155     llvm_unreachable("Expected a memcpy/move or memset!");
1156 
1157   unsigned Limit, Factor = 2;
1158   switch(I->getIntrinsicID()) {
1159     case Intrinsic::memcpy:
1160       Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1161       break;
1162     case Intrinsic::memmove:
1163       Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1164       break;
1165     case Intrinsic::memset:
1166       Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1167       Factor = 1;
1168       break;
1169     default:
1170       llvm_unreachable("Expected a memcpy/move or memset!");
1171   }
1172 
1173   // MemOps will be populated with a list of data types that need to be
1174   // loaded and stored. That's why we multiply the number of elements by 2 to
1175   // get the cost for this memcpy.
1176   std::vector<EVT> MemOps;
1177   if (getTLI()->findOptimalMemOpLowering(
1178           MemOps, Limit, MOp, DstAddrSpace,
1179           SrcAddrSpace, F->getAttributes()))
1180     return MemOps.size() * Factor;
1181 
1182   // If we can't find an optimal memop lowering, return the default cost
1183   return -1;
1184 }
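// Rough example of the counting above: for a 16-byte memcpy with 4-byte
// aligned operands, findOptimalMemOpLowering might choose four i32 chunks;
// each chunk is both loaded and stored, so with Factor == 2 this reports 8
// memory operations, whereas a memset of the same size uses Factor == 1.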
1185 
1186 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1187   int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1188 
1189   // To model the cost of a library call, we assume 1 for the call, and
1190   // 3 for the argument setup.
1191   if (NumOps == -1)
1192     return 4;
1193   return NumOps;
1194 }
1195 
1196 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1197                                            VectorType *Tp, ArrayRef<int> Mask,
1198                                            int Index, VectorType *SubTp) {
1199   Kind = improveShuffleKindFromMask(Kind, Mask);
1200   if (ST->hasNEON()) {
1201     if (Kind == TTI::SK_Broadcast) {
1202       static const CostTblEntry NEONDupTbl[] = {
1203           // VDUP handles these cases.
1204           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1205           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1206           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1207           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1208           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1209           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1210 
1211           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1212           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1213           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1214           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1215 
1216       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1217       if (const auto *Entry =
1218               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1219         return LT.first * Entry->Cost;
1220     }
1221     if (Kind == TTI::SK_Reverse) {
1222       static const CostTblEntry NEONShuffleTbl[] = {
1223           // Reverse shuffles cost one instruction if we are shuffling within a
1224           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1225           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1226           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1227           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1228           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1229           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1230           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1231 
1232           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1233           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1234           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1235           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1236 
1237       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1238       if (const auto *Entry =
1239               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1240         return LT.first * Entry->Cost;
1241     }
1242     if (Kind == TTI::SK_Select) {
1243       static const CostTblEntry NEONSelShuffleTbl[] = {
1244           // Select shuffle cost table for ARM. Cost is the number of
1245           // instructions required to create the shuffled
1246           // vector.
1247 
1248           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1249           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1250           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1251           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1252 
1253           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1254           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1255           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1256 
1257           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1258 
1259           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1260 
1261       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1262       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1263                                               ISD::VECTOR_SHUFFLE, LT.second))
1264         return LT.first * Entry->Cost;
1265     }
1266   }
1267   if (ST->hasMVEIntegerOps()) {
1268     if (Kind == TTI::SK_Broadcast) {
1269       static const CostTblEntry MVEDupTbl[] = {
1270           // VDUP handles these cases.
1271           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1272           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1273           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1274           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1275           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1276 
1277       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1278       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1279                                               LT.second))
1280         return LT.first * Entry->Cost *
1281                ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1282     }
1283 
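         // Illustrative example: for v8i16, a shuffle mask of
         // <1,0,3,2,5,4,7,6> swaps the 16-bit lanes within each 32-bit
         // container, which isVREVMask recognises as a single VREV32.16-style
         // shuffle.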
1284     if (!Mask.empty()) {
1285       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1286       if (Mask.size() <= LT.second.getVectorNumElements() &&
1287           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1288            isVREVMask(Mask, LT.second, 64)))
1289         return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1290     }
1291   }
1292 
1293   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1294                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1295                      : 1;
1296   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1297 }
1298 
1299 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1300     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1301     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1302     TTI::OperandValueProperties Opd1PropInfo,
1303     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1304     const Instruction *CxtI) {
1305   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1306   if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1307     // Make operations on i1 relatively expensive as this often involves
1308     // combining predicates. AND and XOR should be easier to handle with IT
1309     // blocks.
1310     switch (ISDOpcode) {
1311     default:
1312       break;
1313     case ISD::AND:
1314     case ISD::XOR:
1315       return 2;
1316     case ISD::OR:
1317       return 3;
1318     }
1319   }
1320 
1321   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1322 
1323   if (ST->hasNEON()) {
1324     const unsigned FunctionCallDivCost = 20;
1325     const unsigned ReciprocalDivCost = 10;
1326     static const CostTblEntry CostTbl[] = {
1327       // Division.
1328       // These costs are somewhat random. Choose a cost of 20 to indicate that
1329       // vectorizing division (added function call) is going to be very expensive.
1330       // Double register types.
1331       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1332       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1333       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1334       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1335       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1336       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1337       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1338       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1339       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
1340       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
1341       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1342       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1343       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
1344       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
1345       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
1346       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
1347       // Quad register types.
1348       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1349       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1350       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1351       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1352       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1353       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1354       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1355       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1356       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1357       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1358       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1359       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1360       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1361       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1362       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1363       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1364       // Multiplication.
1365     };
1366 
1367     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1368       return LT.first * Entry->Cost;
1369 
1370     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1371         Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1372 
1373     // This is somewhat of a hack. The problem that we are facing is that SROA
1374     // creates a sequence of shift, and, or instructions to construct values.
1375     // These sequences are recognized by the ISel and have zero-cost. Not so for
1376     // the vectorized code. Because we have support for v2i64 but not i64 those
1377     // sequences look particularly beneficial to vectorize.
1378     // To work around this we increase the cost of v2i64 operations to make them
1379     // seem less beneficial.
1380     if (LT.second == MVT::v2i64 &&
1381         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1382       Cost += 4;
1383 
1384     return Cost;
1385   }
1386 
1387   // If this operation is a shift on arm/thumb2, it might well be folded into
1388   // the following instruction, hence having a cost of 0.
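       // For example, the Arm/Thumb-2 shifted-operand forms mean that
       // "add r0, r1, r2, lsl #2" performs the shift as part of the add
       // (illustrative).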
1389   auto LooksLikeAFreeShift = [&]() {
1390     if (ST->isThumb1Only() || Ty->isVectorTy())
1391       return false;
1392 
1393     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1394       return false;
1395     if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1396       return false;
1397 
1398     // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1399     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1400     case Instruction::Add:
1401     case Instruction::Sub:
1402     case Instruction::And:
1403     case Instruction::Xor:
1404     case Instruction::Or:
1405     case Instruction::ICmp:
1406       return true;
1407     default:
1408       return false;
1409     }
1410   };
1411   if (LooksLikeAFreeShift())
1412     return 0;
1413 
1414   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1415   // for "multiple beats" potentially needed by MVE instructions.
1416   int BaseCost = 1;
1417   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1418     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1419 
1420   // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1421   // without treating floats as more expensive than scalars or increasing the
1422   // costs for custom operations. The result is also multiplied by the
1423   // MVEVectorCostFactor where appropriate.
1424   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1425     return LT.first * BaseCost;
1426 
1427   // Else this is expand, assume that we need to scalarize this op.
1428   if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1429     unsigned Num = VTy->getNumElements();
1430     InstructionCost Cost =
1431         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1432     // Return the cost of multiple scalar invocations plus the cost of
1433     // inserting and extracting the values.
1434     SmallVector<Type *> Tys(Args.size(), Ty);
1435     return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1436   }
1437 
1438   return BaseCost;
1439 }
1440 
1441 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1442                                             MaybeAlign Alignment,
1443                                             unsigned AddressSpace,
1444                                             TTI::TargetCostKind CostKind,
1445                                             const Instruction *I) {
1446   // TODO: Handle other cost kinds.
1447   if (CostKind != TTI::TCK_RecipThroughput)
1448     return 1;
1449 
1450   // Type legalization can't handle structs
1451   if (TLI->getValueType(DL, Src, true) == MVT::Other)
1452     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1453                                   CostKind);
1454 
1455   if (ST->hasNEON() && Src->isVectorTy() &&
1456       (Alignment && *Alignment != Align(16)) &&
1457       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1458     // Unaligned loads/stores are extremely inefficient.
1459     // We need 4 uops for vst1/vld1 vs. one uop for vldr/vstr.
1460     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1461     return LT.first * 4;
1462   }
1463 
1464   // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1465   // Same for stores.
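       // Illustrative IR for the load case:
       //   %l = load <4 x half>, <4 x half>* %p
       //   %e = fpext <4 x half> %l to <4 x float>
       // which MVE can handle with an extending load rather than a separate
       // load and fpext.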
1466   if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1467       ((Opcode == Instruction::Load && I->hasOneUse() &&
1468         isa<FPExtInst>(*I->user_begin())) ||
1469        (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1470     FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1471     Type *DstTy =
1472         Opcode == Instruction::Load
1473             ? (*I->user_begin())->getType()
1474             : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1475     if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1476         DstTy->getScalarType()->isFloatTy())
1477       return ST->getMVEVectorCostFactor(CostKind);
1478   }
1479 
1480   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1481                      ? ST->getMVEVectorCostFactor(CostKind)
1482                      : 1;
1483   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1484                                            CostKind, I);
1485 }
1486 
1487 InstructionCost
1488 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1489                                   unsigned AddressSpace,
1490                                   TTI::TargetCostKind CostKind) {
1491   if (ST->hasMVEIntegerOps()) {
1492     if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1493       return ST->getMVEVectorCostFactor(CostKind);
1494     if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1495       return ST->getMVEVectorCostFactor(CostKind);
1496   }
1497   if (!isa<FixedVectorType>(Src))
1498     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1499                                         CostKind);
1500   // Scalar cost, which is currently very high due to the inefficiency of the
1501   // generated code.
1502   return cast<FixedVectorType>(Src)->getNumElements() * 8;
1503 }
1504 
1505 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1506     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1507     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1508     bool UseMaskForCond, bool UseMaskForGaps) {
1509   assert(Factor >= 2 && "Invalid interleave factor");
1510   assert(isa<VectorType>(VecTy) && "Expect a vector type");
1511 
1512   // vldN/vstN don't support vector types with i64/f64 elements.
1513   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1514 
1515   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1516       !UseMaskForCond && !UseMaskForGaps) {
1517     unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1518     auto *SubVecTy =
1519         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1520 
1521     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1522     // Accesses having vector types that are a multiple of 128 bits can be
1523     // matched to more than one vldN/vstN instruction.
1524     int BaseCost =
1525         ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1526     if (NumElts % Factor == 0 &&
1527         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1528       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1529 
1530     // Some smaller than legal interleaved patterns are cheap as we can make
1531     // use of the vmovn or vrev patterns to interleave a standard load. This is
1532     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1533     // promoted differently). The cost of 2 here is then a load and vrev or
1534     // vmovn.
1535     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1536         VecTy->isIntOrIntVectorTy() &&
1537         DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1538       return 2 * BaseCost;
1539   }
1540 
1541   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1542                                            Alignment, AddressSpace, CostKind,
1543                                            UseMaskForCond, UseMaskForGaps);
1544 }
1545 
1546 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1547     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1548     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1549   using namespace PatternMatch;
1550   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1551     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1552                                          Alignment, CostKind, I);
1553 
1554   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1555   auto *VTy = cast<FixedVectorType>(DataTy);
1556 
1557   // TODO: Splitting, once we do that.
1558 
1559   unsigned NumElems = VTy->getNumElements();
1560   unsigned EltSize = VTy->getScalarSizeInBits();
1561   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1562 
1563   // For now, it is assumed that for the MVE gather instructions the loads are
1564   // all effectively serialised. This means the cost is the scalar cost
1565   // multiplied by the number of elements being loaded. This is possibly very
1566   // conservative, but even so we still end up vectorising loops because the
1567   // cost per iteration for many loops is lower than for scalar loops.
1568   InstructionCost VectorCost =
1569       NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1570   // The scalarization cost should be a lot higher. We use the number of vector
1571   // elements plus the scalarization overhead.
1572   InstructionCost ScalarCost =
1573       NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1574       BaseT::getScalarizationOverhead(VTy, false, true);
1575 
1576   if (EltSize < 8 || Alignment < EltSize / 8)
1577     return ScalarCost;
1578 
1579   unsigned ExtSize = EltSize;
1580   // Check whether there's a single user that asks for an extended type
1581   if (I != nullptr) {
1582     // Depending on the caller of this function, a gather instruction will
1583     // either have opcode Instruction::Load or be a call to the masked_gather
1584     // intrinsic.
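         // Illustrative example: a gather of <4 x i8> whose only use is a
         // zext to <4 x i32> is costed as the 32-bit form (ExtSize = 32).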
1585     if ((I->getOpcode() == Instruction::Load ||
1586          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1587         I->hasOneUse()) {
1588       const User *Us = *I->users().begin();
1589       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1590         // Only allow valid type combinations.
1591         unsigned TypeSize =
1592             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1593         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1594              (TypeSize == 16 && EltSize == 8)) &&
1595             TypeSize * NumElems == 128) {
1596           ExtSize = TypeSize;
1597         }
1598       }
1599     }
1600     // Check whether the input data needs to be truncated
1601     TruncInst *T;
1602     if ((I->getOpcode() == Instruction::Store ||
1603          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1604         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1605       // Only allow valid type combinations
1606       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1607       if (((EltSize == 16 && TypeSize == 32) ||
1608            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1609           TypeSize * NumElems == 128)
1610         ExtSize = TypeSize;
1611     }
1612   }
1613 
1614   if (ExtSize * NumElems != 128 || NumElems < 4)
1615     return ScalarCost;
1616 
1617   // Any (aligned) i32 gather will not need to be scalarised.
1618   if (ExtSize == 32)
1619     return VectorCost;
1620   // For smaller types, we need to ensure that the gep's inputs are correctly
1621   // extended from a small enough value. Other sizes (including i64) are
1622   // scalarized for now.
1623   if (ExtSize != 8 && ExtSize != 16)
1624     return ScalarCost;
1625 
1626   if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1627     Ptr = BC->getOperand(0);
1628   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1629     if (GEP->getNumOperands() != 2)
1630       return ScalarCost;
1631     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1632     // Scale needs to be correct (which is only relevant for i16s).
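         // For example, 16-bit gathers need byte offsets (scale 1) or
         // halfword-scaled offsets (scale 2) to be costed as a vector gather
         // (illustrative).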
1633     if (Scale != 1 && Scale * 8 != ExtSize)
1634       return ScalarCost;
1635     // And we need to zext (not sext) the indexes from a small enough type.
1636     if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1637       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1638         return VectorCost;
1639     }
1640     return ScalarCost;
1641   }
1642   return ScalarCost;
1643 }
1644 
1645 InstructionCost
1646 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1647                                        Optional<FastMathFlags> FMF,
1648                                        TTI::TargetCostKind CostKind) {
1649   if (TTI::requiresOrderedReduction(FMF))
1650     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1651 
1652   EVT ValVT = TLI->getValueType(DL, ValTy);
1653   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1654   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1655     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1656 
1657   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1658 
1659   static const CostTblEntry CostTblAdd[]{
1660       {ISD::ADD, MVT::v16i8, 1},
1661       {ISD::ADD, MVT::v8i16, 1},
1662       {ISD::ADD, MVT::v4i32, 1},
1663   };
1664   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1665     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1666 
1667   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1668 }
1669 
1670 InstructionCost
1671 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1672                                         Type *ResTy, VectorType *ValTy,
1673                                         TTI::TargetCostKind CostKind) {
1674   EVT ValVT = TLI->getValueType(DL, ValTy);
1675   EVT ResVT = TLI->getValueType(DL, ResTy);
1676 
1677   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1678     std::pair<InstructionCost, MVT> LT =
1679         TLI->getTypeLegalizationCost(DL, ValTy);
1680 
1681     // The legal cases are:
1682     //   VADDV u/s 8/16/32
1683     //   VMLAV u/s 8/16/32
1684     //   VADDLV u/s 32
1685     //   VMLALV u/s 16/32
1686     // Codegen currently cannot always handle larger than legal vectors very
1687     // well, especially for predicated reductions where the mask needs to be
1688     // split, so restrict to 128bit or smaller input types.
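         // For example, accumulating the products of two <8 x i16> inputs
         // into an i64 result corresponds to the "VMLALV u/s 16" case listed
         // above (illustrative).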
1689     unsigned RevVTSize = ResVT.getSizeInBits();
1690     if (ValVT.getSizeInBits() <= 128 &&
1691         ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1692          (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1693          (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1694       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1695   }
1696 
1697   return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1698                                             CostKind);
1699 }
1700 
1701 InstructionCost
1702 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1703                                   TTI::TargetCostKind CostKind) {
1704   switch (ICA.getID()) {
1705   case Intrinsic::get_active_lane_mask:
1706     // Currently we make a somewhat optimistic assumption that
1707     // active_lane_masks are always free. In reality it may be freely folded
1708     // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1709     // of add/icmp code. We may need to improve this in the future, but being
1710     // able to detect if it is free or not involves looking at a lot of other
1711     // code. We currently assume that the vectorizer inserted these, and knew
1712     // what it was doing in adding one.
1713     if (ST->hasMVEIntegerOps())
1714       return 0;
1715     break;
1716   case Intrinsic::sadd_sat:
1717   case Intrinsic::ssub_sat:
1718   case Intrinsic::uadd_sat:
1719   case Intrinsic::usub_sat: {
1720     if (!ST->hasMVEIntegerOps())
1721       break;
1722     Type *VT = ICA.getReturnType();
1723 
1724     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1725     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1726         LT.second == MVT::v16i8) {
1727       // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1728       // need to extend the type, as it uses shr(qadd(shl, shl)).
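           // For example, if a small vector type legalizes to one with wider
           // elements (so the scalar sizes differ), the saturating add becomes
           // shl, shl, vqadd, shr rather than a single vqadd (illustrative).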
1729       unsigned Instrs =
1730           LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1731       return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1732     }
1733     break;
1734   }
1735   case Intrinsic::abs:
1736   case Intrinsic::smin:
1737   case Intrinsic::smax:
1738   case Intrinsic::umin:
1739   case Intrinsic::umax: {
1740     if (!ST->hasMVEIntegerOps())
1741       break;
1742     Type *VT = ICA.getReturnType();
1743 
1744     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1745     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1746         LT.second == MVT::v16i8)
1747       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1748     break;
1749   }
1750   case Intrinsic::minnum:
1751   case Intrinsic::maxnum: {
1752     if (!ST->hasMVEFloatOps())
1753       break;
1754     Type *VT = ICA.getReturnType();
1755     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1756     if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1757       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1758     break;
1759   }
1760   }
1761 
1762   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1763 }
1764 
1765 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1766   if (!F->isIntrinsic())
1767     return BaseT::isLoweredToCall(F);
1768 
1769   // Assume all Arm-specific intrinsics map to an instruction.
1770   if (F->getName().startswith("llvm.arm"))
1771     return false;
1772 
1773   switch (F->getIntrinsicID()) {
1774   default: break;
1775   case Intrinsic::powi:
1776   case Intrinsic::sin:
1777   case Intrinsic::cos:
1778   case Intrinsic::pow:
1779   case Intrinsic::log:
1780   case Intrinsic::log10:
1781   case Intrinsic::log2:
1782   case Intrinsic::exp:
1783   case Intrinsic::exp2:
1784     return true;
1785   case Intrinsic::sqrt:
1786   case Intrinsic::fabs:
1787   case Intrinsic::copysign:
1788   case Intrinsic::floor:
1789   case Intrinsic::ceil:
1790   case Intrinsic::trunc:
1791   case Intrinsic::rint:
1792   case Intrinsic::nearbyint:
1793   case Intrinsic::round:
1794   case Intrinsic::canonicalize:
1795   case Intrinsic::lround:
1796   case Intrinsic::llround:
1797   case Intrinsic::lrint:
1798   case Intrinsic::llrint:
1799     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1800       return true;
1801     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1802       return true;
1803     // Some operations can be handled by vector instructions; assume
1804     // unsupported vectors will be expanded into supported scalar ones.
1805     // TODO: Handle scalar operations properly.
1806     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1807   case Intrinsic::masked_store:
1808   case Intrinsic::masked_load:
1809   case Intrinsic::masked_gather:
1810   case Intrinsic::masked_scatter:
1811     return !ST->hasMVEIntegerOps();
1812   case Intrinsic::sadd_with_overflow:
1813   case Intrinsic::uadd_with_overflow:
1814   case Intrinsic::ssub_with_overflow:
1815   case Intrinsic::usub_with_overflow:
1816   case Intrinsic::sadd_sat:
1817   case Intrinsic::uadd_sat:
1818   case Intrinsic::ssub_sat:
1819   case Intrinsic::usub_sat:
1820     return false;
1821   }
1822 
1823   return BaseT::isLoweredToCall(F);
1824 }
1825 
1826 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1827   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1828   EVT VT = TLI->getValueType(DL, I.getType(), true);
1829   if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1830     return true;
1831 
1832   // Check if an intrinsic will be lowered to a call and assume that any
1833   // other CallInst will generate a bl.
1834   if (auto *Call = dyn_cast<CallInst>(&I)) {
1835     if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1836       switch(II->getIntrinsicID()) {
1837         case Intrinsic::memcpy:
1838         case Intrinsic::memset:
1839         case Intrinsic::memmove:
1840           return getNumMemOps(II) == -1;
1841         default:
1842           if (const Function *F = Call->getCalledFunction())
1843             return isLoweredToCall(F);
1844       }
1845     }
1846     return true;
1847   }
1848 
1849   // FPv5 provides conversions between integer, double-precision,
1850   // single-precision, and half-precision formats.
1851   switch (I.getOpcode()) {
1852   default:
1853     break;
1854   case Instruction::FPToSI:
1855   case Instruction::FPToUI:
1856   case Instruction::SIToFP:
1857   case Instruction::UIToFP:
1858   case Instruction::FPTrunc:
1859   case Instruction::FPExt:
1860     return !ST->hasFPARMv8Base();
1861   }
1862 
1863   // FIXME: Unfortunately the approach of checking the Operation Action does
1864   // not catch all cases of Legalization that use library calls. Our
1865   // Legalization step categorizes some transformations into library calls as
1866   // Custom, Expand or even Legal when doing type legalization. So for now
1867   // we have to special case for instance the SDIV of 64bit integers and the
1868   // use of floating point emulation.
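       // For example, a 64-bit sdiv/srem is typically lowered to a runtime
       // call such as __aeabi_ldivmod on AEABI targets.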
1869   if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1870     switch (ISD) {
1871     default:
1872       break;
1873     case ISD::SDIV:
1874     case ISD::UDIV:
1875     case ISD::SREM:
1876     case ISD::UREM:
1877     case ISD::SDIVREM:
1878     case ISD::UDIVREM:
1879       return true;
1880     }
1881   }
1882 
1883   // Assume all other non-float operations are supported.
1884   if (!VT.isFloatingPoint())
1885     return false;
1886 
1887   // We'll need a library call to handle most floats when using soft-float.
1888   if (TLI->useSoftFloat()) {
1889     switch (I.getOpcode()) {
1890     default:
1891       return true;
1892     case Instruction::Alloca:
1893     case Instruction::Load:
1894     case Instruction::Store:
1895     case Instruction::Select:
1896     case Instruction::PHI:
1897       return false;
1898     }
1899   }
1900 
1901   // We'll need a libcall to perform double precision operations on a single
1902   // precision only FPU.
1903   if (I.getType()->isDoubleTy() && !ST->hasFP64())
1904     return true;
1905 
1906   // Likewise for half precision arithmetic.
1907   if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1908     return true;
1909 
1910   return false;
1911 }
1912 
1913 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1914                                           AssumptionCache &AC,
1915                                           TargetLibraryInfo *LibInfo,
1916                                           HardwareLoopInfo &HWLoopInfo) {
1917   // Low-overhead branches are only supported in the 'low-overhead branch'
1918   // extension of v8.1-m.
1919   if (!ST->hasLOB() || DisableLowOverheadLoops) {
1920     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1921     return false;
1922   }
1923 
1924   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1925     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1926     return false;
1927   }
1928 
1929   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1930   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1931     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1932     return false;
1933   }
1934 
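       // The trip count is the backedge-taken count plus one.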
1935   const SCEV *TripCountSCEV =
1936     SE.getAddExpr(BackedgeTakenCount,
1937                   SE.getOne(BackedgeTakenCount->getType()));
1938 
1939   // We need to store the trip count in LR, a 32-bit register.
1940   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1941     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1942     return false;
1943   }
1944 
1945   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1946   // point in generating a hardware loop if that's going to happen.
1947 
1948   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1949     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1950       switch (Call->getIntrinsicID()) {
1951       default:
1952         break;
1953       case Intrinsic::start_loop_iterations:
1954       case Intrinsic::test_start_loop_iterations:
1955       case Intrinsic::loop_decrement:
1956       case Intrinsic::loop_decrement_reg:
1957         return true;
1958       }
1959     }
1960     return false;
1961   };
1962 
1963   // Scan the instructions to see if there's any that we know will turn into a
1964   // call or if this loop is already a low-overhead loop or will become a tail
1965   // predicated loop.
1966   bool IsTailPredLoop = false;
1967   auto ScanLoop = [&](Loop *L) {
1968     for (auto *BB : L->getBlocks()) {
1969       for (auto &I : *BB) {
1970         if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1971             isa<InlineAsm>(I)) {
1972           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1973           return false;
1974         }
1975         if (auto *II = dyn_cast<IntrinsicInst>(&I))
1976           IsTailPredLoop |=
1977               II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1978               II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1979               II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1980               II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1981               II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1982       }
1983     }
1984     return true;
1985   };
1986 
1987   // Visit inner loops.
1988   for (auto Inner : *L)
1989     if (!ScanLoop(Inner))
1990       return false;
1991 
1992   if (!ScanLoop(L))
1993     return false;
1994 
1995   // TODO: Check whether the trip count calculation is expensive. If L is the
1996   // inner loop but we know it has a low trip count, calculating that trip
1997   // count (in the parent loop) may be detrimental.
1998 
1999   LLVMContext &C = L->getHeader()->getContext();
2000   HWLoopInfo.CounterInReg = true;
2001   HWLoopInfo.IsNestingLegal = false;
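       // PerformEntryTest allows a WLS (while-loop-start) to be generated,
       // which tests the trip count and branches past the loop body when it
       // is zero.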
2002   HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2003   HWLoopInfo.CountType = Type::getInt32Ty(C);
2004   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2005   return true;
2006 }
2007 
2008 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2009   // We don't allow icmp's, and because we only look at single block loops,
2010   // we simply count the icmps, i.e. there should only be 1 for the backedge.
2011   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2012     return false;
2013   // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2014   // not currently canonical, but soon will be. Code without them uses icmp, and
2015   // so is not tail predicated as per the condition above. In order to get the
2016   // same performance we treat min and max the same as an icmp for tailpred
2017   // purposes for the moment (we often rely on non-tailpred and higher VF's to
2018   // pick more optimial instructions like VQDMULH. They need to be recognized
2019   // pick more optimal instructions like VQDMULH. They need to be recognized
2020   if (auto *II = dyn_cast<IntrinsicInst>(&I))
2021     if ((II->getIntrinsicID() == Intrinsic::smin ||
2022          II->getIntrinsicID() == Intrinsic::smax ||
2023          II->getIntrinsicID() == Intrinsic::umin ||
2024          II->getIntrinsicID() == Intrinsic::umax) &&
2025         ++ICmpCount > 1)
2026       return false;
2027 
2028   if (isa<FCmpInst>(&I))
2029     return false;
2030 
2031   // We could allow extending/narrowing FP loads/stores, but codegen is
2032   // too inefficient so reject this for now.
2033   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2034     return false;
2035 
2036   // Extends have to be extending-loads
2037   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2038     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2039       return false;
2040 
2041   // Truncs have to be narrowing-stores
2042   if (isa<TruncInst>(&I))
2043     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2044       return false;
2045 
2046   return true;
2047 }
2048 
2049 // To set up a tail-predicated loop, we need to know the total number of
2050 // elements processed by that loop. Thus, we need to determine the element
2051 // size and:
2052 // 1) it should be uniform for all operations in the vector loop, so we
2053 //    e.g. don't want any widening/narrowing operations.
2054 // 2) it should be smaller than i64s because we don't have vector operations
2055 //    that work on i64s.
2056 // 3) we don't want elements to be reversed or shuffled, to make sure the
2057 //    tail-predication masks/predicates the right lanes.
2058 //
2059 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2060                                  const DataLayout &DL,
2061                                  const LoopAccessInfo *LAI) {
2062   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2063 
2064   // If there are live-out values, it is probably a reduction. We can predicate
2065   // most reduction operations freely under MVE using a combination of
2066   // prefer-predicated-reduction-select and inloop reductions. We limit this to
2067   // floating point and integer reductions, but don't check for operators
2068   // specifically here. If the value ends up not being a reduction (and so the
2069   // vectorizer cannot tailfold the loop), we should fall back to standard
2070   // vectorization automatically.
2071   SmallVector<Instruction *, 8> LiveOuts =
2072       llvm::findDefsUsedOutsideOfLoop(L);
2073   bool ReductionsDisabled =
2074       EnableTailPredication == TailPredication::EnabledNoReductions ||
2075       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2076 
2077   for (auto *I : LiveOuts) {
2078     if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2079         !I->getType()->isHalfTy()) {
2080       LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2081                            "live-out value\n");
2082       return false;
2083     }
2084     if (ReductionsDisabled) {
2085       LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2086       return false;
2087     }
2088   }
2089 
2090   // Next, check that all instructions can be tail-predicated.
2091   PredicatedScalarEvolution PSE = LAI->getPSE();
2092   SmallVector<Instruction *, 16> LoadStores;
2093   int ICmpCount = 0;
2094 
2095   for (BasicBlock *BB : L->blocks()) {
2096     for (Instruction &I : BB->instructionsWithoutDebug()) {
2097       if (isa<PHINode>(&I))
2098         continue;
2099       if (!canTailPredicateInstruction(I, ICmpCount)) {
2100         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2101         return false;
2102       }
2103 
2104       Type *T = I.getType();
2105       if (T->isPointerTy())
2106         T = T->getPointerElementType();
2107 
2108       if (T->getScalarSizeInBits() > 32) {
2109         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2110         return false;
2111       }
2112       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2113         Value *Ptr = getLoadStorePointerOperand(&I);
2114         Type *AccessTy = getLoadStoreType(&I);
2115         int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2116         if (NextStride == 1) {
2117           // TODO: for now only allow consecutive strides of 1. We could support
2118           // other strides as long as it is uniform, but let's keep it simple
2119           // for now.
2120           continue;
2121         } else if (NextStride == -1 ||
2122                    (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2123                    (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2124           LLVM_DEBUG(dbgs()
2125                      << "Reversed or interleaving (vld2/vld4) stride found, "
2126                         "can't be tail-predicated.\n");
2127           return false;
2128           // TODO: don't tail predicate if there is a reversed load?
2129         } else if (EnableMaskedGatherScatters) {
2130           // Gather/scatters do allow loading from arbitrary strides, at
2131           // least if they are loop invariant.
2132           // TODO: Loop variant strides should in theory work, too, but
2133           // this requires further testing.
2134           const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2135           if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2136             const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2137             if (PSE.getSE()->isLoopInvariant(Step, L))
2138               continue;
2139           }
2140         }
2141         LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2142                              "tail-predicate.\n");
2143         return false;
2144       }
2145     }
2146   }
2147 
2148   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2149   return true;
2150 }
2151 
2152 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2153                                              ScalarEvolution &SE,
2154                                              AssumptionCache &AC,
2155                                              TargetLibraryInfo *TLI,
2156                                              DominatorTree *DT,
2157                                              const LoopAccessInfo *LAI) {
2158   if (!EnableTailPredication) {
2159     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2160     return false;
2161   }
2162 
2163   // Creating a predicated vector loop is the first step for generating a
2164   // tail-predicated hardware loop, for which we need the MVE masked
2165   // load/stores instructions:
2166   if (!ST->hasMVEIntegerOps())
2167     return false;
2168 
2169   // For now, restrict this to single block loops.
2170   if (L->getNumBlocks() > 1) {
2171     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2172                          "loop.\n");
2173     return false;
2174   }
2175 
2176   assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2177 
2178   HardwareLoopInfo HWLoopInfo(L);
2179   if (!HWLoopInfo.canAnalyze(*LI)) {
2180     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2181                          "analyzable.\n");
2182     return false;
2183   }
2184 
2185   // This checks if we have the low-overhead branch architecture
2186   // extension, and if we will create a hardware-loop:
2187   if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2188     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2189                          "profitable.\n");
2190     return false;
2191   }
2192 
2193   if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2194     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2195                          "a candidate.\n");
2196     return false;
2197   }
2198 
2199   return canTailPredicateLoop(L, LI, SE, DL, LAI);
2200 }
2201 
2202 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2203   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2204     return false;
2205 
2206   // Intrinsic @llvm.get.active.lane.mask is supported.
2207   // It is used in the MVETailPredication pass, which requires the number of
2208   // elements processed by this vector loop to setup the tail-predicated
2209   // loop.
2210   return true;
2211 }
2212 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2213                                          TTI::UnrollingPreferences &UP,
2214                                          OptimizationRemarkEmitter *ORE) {
2215   // Enable upper-bound unrolling universally, not dependent upon the conditions
2216   // below.
2217   UP.UpperBound = true;
2218 
2219   // Only currently enable these preferences for M-Class cores.
2220   if (!ST->isMClass())
2221     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2222 
2223   // Disable loop unrolling for Oz and Os.
2224   UP.OptSizeThreshold = 0;
2225   UP.PartialOptSizeThreshold = 0;
2226   if (L->getHeader()->getParent()->hasOptSize())
2227     return;
2228 
2229   SmallVector<BasicBlock*, 4> ExitingBlocks;
2230   L->getExitingBlocks(ExitingBlocks);
2231   LLVM_DEBUG(dbgs() << "Loop has:\n"
2232                     << "Blocks: " << L->getNumBlocks() << "\n"
2233                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
2234 
2235   // Only allow one exit other than the latch. This acts as an early exit
2236   // as it mirrors the profitability calculation of the runtime unroller.
2237   if (ExitingBlocks.size() > 2)
2238     return;
2239 
2240   // Limit the CFG of the loop body for targets with a branch predictor.
2241   // Allowing 4 blocks permits if-then-else diamonds in the body.
2242   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2243     return;
2244 
2245   // Don't unroll vectorized loops, including the remainder loop
2246   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2247     return;
2248 
2249   // Scan the loop: don't unroll loops with calls as this could prevent
2250   // inlining.
2251   InstructionCost Cost = 0;
2252   for (auto *BB : L->getBlocks()) {
2253     for (auto &I : *BB) {
2254       // Don't unroll vectorised loops. MVE does not benefit from it as much as
2255       // scalar code.
2256       if (I.getType()->isVectorTy())
2257         return;
2258 
2259       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2260         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2261           if (!isLoweredToCall(F))
2262             continue;
2263         }
2264         return;
2265       }
2266 
2267       SmallVector<const Value*, 4> Operands(I.operand_values());
2268       Cost +=
2269         getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2270     }
2271   }
2272 
2273   // On v6m cores, there are very few registers available. We can easily end up
2274   // spilling and reloading more registers in an unrolled loop. Look at the
2275   // number of LCSSA phis as a rough measure of how many registers will need to
2276   // be live out of the loop, reducing the default unroll count if more than 1
2277   // value is needed. In the long run, all of this should be learnt by a
2278   // machine.
2279   unsigned UnrollCount = 4;
2280   if (ST->isThumb1Only()) {
2281     unsigned ExitingValues = 0;
2282     SmallVector<BasicBlock *, 4> ExitBlocks;
2283     L->getExitBlocks(ExitBlocks);
2284     for (auto *Exit : ExitBlocks) {
2285       // Count the number of LCSSA phis. Exclude values coming from GEP's as
2286       // only the last is expected to be needed for address operands.
2287       unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2288         return PH.getNumOperands() != 1 ||
2289                !isa<GetElementPtrInst>(PH.getOperand(0));
2290       });
2291       ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2292     }
2293     if (ExitingValues)
2294       UnrollCount /= ExitingValues;
2295     if (UnrollCount <= 1)
2296       return;
2297   }
2298 
2299   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2300   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2301 
2302   UP.Partial = true;
2303   UP.Runtime = true;
2304   UP.UnrollRemainder = true;
2305   UP.DefaultUnrollRuntimeCount = UnrollCount;
2306   UP.UnrollAndJam = true;
2307   UP.UnrollAndJamInnerLoopThreshold = 60;
2308 
2309   // Force-unrolling small loops can be very useful because of the branch-taken
2310   // cost of the backedge.
2311   if (Cost < 12)
2312     UP.Force = true;
2313 }
2314 
2315 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2316                                        TTI::PeelingPreferences &PP) {
2317   BaseT::getPeelingPreferences(L, SE, PP);
2318 }
2319 
2320 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2321                                        TTI::ReductionFlags Flags) const {
2322   if (!ST->hasMVEIntegerOps())
2323     return false;
2324 
2325   unsigned ScalarBits = Ty->getScalarSizeInBits();
2326   switch (Opcode) {
2327   case Instruction::Add:
2328     return ScalarBits <= 64;
2329   default:
2330     return false;
2331   }
2332 }
2333 
2334 bool ARMTTIImpl::preferPredicatedReductionSelect(
2335     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2336   if (!ST->hasMVEIntegerOps())
2337     return false;
2338   return true;
2339 }
2340