1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/CodeGenTypes/MachineValueType.h"
19 #include "llvm/IR/BasicBlock.h"
20 #include "llvm/IR/DataLayout.h"
21 #include "llvm/IR/DerivedTypes.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/IntrinsicsARM.h"
27 #include "llvm/IR/PatternMatch.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
31 #include "llvm/Target/TargetMachine.h"
32 #include "llvm/TargetParser/SubtargetFeature.h"
33 #include "llvm/Transforms/InstCombine/InstCombiner.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
37 #include <algorithm>
38 #include <cassert>
39 #include <cstdint>
40 #include <optional>
41 #include <utility>
42
43 using namespace llvm;
44
45 #define DEBUG_TYPE "armtti"
46
47 static cl::opt<bool> EnableMaskedLoadStores(
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
51 static cl::opt<bool> DisableLowOverheadLoops(
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55 static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
59 static cl::opt<bool> UseWidenGlobalArrays(
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
63 extern cl::opt<TailPredication::Mode> EnableTailPredication;
64
65 extern cl::opt<bool> EnableMaskedGatherScatters;
66
67 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
68
69 /// Convert a vector load intrinsic into a simple llvm load instruction.
70 /// This is beneficial when the underlying object being addressed comes
71 /// from a constant, since we get constant-folding for free.
72 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
73 InstCombiner::BuilderTy &Builder) {
74 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
75
76 if (!IntrAlign)
77 return nullptr;
78
79 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
80 ? MemAlign
81 : IntrAlign->getLimitedValue();
82
83 if (!isPowerOf2_32(Alignment))
84 return nullptr;
85
86 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
87 Align(Alignment));
88 }
89
90 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
91 const Function *Callee) const {
92 const TargetMachine &TM = getTLI()->getTargetMachine();
93 const FeatureBitset &CallerBits =
94 TM.getSubtargetImpl(*Caller)->getFeatureBits();
95 const FeatureBitset &CalleeBits =
96 TM.getSubtargetImpl(*Callee)->getFeatureBits();
97
98 // To inline a callee, all features not in the allowed list must match exactly.
99 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
100 (CalleeBits & ~InlineFeaturesAllowed);
101 // For features in the allowed list, the callee's features must be a subset of
102 // the caller's.
103 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
104 (CalleeBits & InlineFeaturesAllowed);
105 return MatchExact && MatchSubset;
106 }
107
108 TTI::AddressingModeKind
109 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
110 ScalarEvolution *SE) const {
111 if (ST->hasMVEIntegerOps())
112 return TTI::AMK_PostIndexed;
113
114 if (L->getHeader()->getParent()->hasOptSize())
115 return TTI::AMK_None;
116
117 if (ST->isMClass() && ST->isThumb2() &&
118 L->getNumBlocks() == 1)
119 return TTI::AMK_PreIndexed;
120
121 return TTI::AMK_None;
122 }
123
124 std::optional<Instruction *>
125 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
126 using namespace PatternMatch;
127 Intrinsic::ID IID = II.getIntrinsicID();
128 switch (IID) {
129 default:
130 break;
131 case Intrinsic::arm_neon_vld1: {
132 Align MemAlign =
133 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
134 &IC.getAssumptionCache(), &IC.getDominatorTree());
135 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
136 return IC.replaceInstUsesWith(II, V);
137 }
138 break;
139 }
140
141 case Intrinsic::arm_neon_vld2:
142 case Intrinsic::arm_neon_vld3:
143 case Intrinsic::arm_neon_vld4:
144 case Intrinsic::arm_neon_vld2lane:
145 case Intrinsic::arm_neon_vld3lane:
146 case Intrinsic::arm_neon_vld4lane:
147 case Intrinsic::arm_neon_vst1:
148 case Intrinsic::arm_neon_vst2:
149 case Intrinsic::arm_neon_vst3:
150 case Intrinsic::arm_neon_vst4:
151 case Intrinsic::arm_neon_vst2lane:
152 case Intrinsic::arm_neon_vst3lane:
153 case Intrinsic::arm_neon_vst4lane: {
154 Align MemAlign =
155 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
156 &IC.getAssumptionCache(), &IC.getDominatorTree());
157 unsigned AlignArg = II.arg_size() - 1;
158 Value *AlignArgOp = II.getArgOperand(AlignArg);
159 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
160 if (Align && *Align < MemAlign) {
161 return IC.replaceOperand(
162 II, AlignArg,
163 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
164 false));
165 }
166 break;
167 }
168
169 case Intrinsic::arm_neon_vld1x2:
170 case Intrinsic::arm_neon_vld1x3:
171 case Intrinsic::arm_neon_vld1x4:
172 case Intrinsic::arm_neon_vst1x2:
173 case Intrinsic::arm_neon_vst1x3:
174 case Intrinsic::arm_neon_vst1x4: {
175 Align NewAlign =
176 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
177 &IC.getAssumptionCache(), &IC.getDominatorTree());
178 Align OldAlign = II.getParamAlign(0).valueOrOne();
179 if (NewAlign > OldAlign)
180 II.addParamAttr(0,
181 Attribute::getWithAlignment(II.getContext(), NewAlign));
182 break;
183 }
184
185 case Intrinsic::arm_mve_pred_i2v: {
186 Value *Arg = II.getArgOperand(0);
187 Value *ArgArg;
188 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
189 PatternMatch::m_Value(ArgArg))) &&
190 II.getType() == ArgArg->getType()) {
191 return IC.replaceInstUsesWith(II, ArgArg);
192 }
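// If the lanemask was complemented in the scalar domain (v2i(x) ^ 0xffff),
// perform the complement in the vector domain instead:
// i2v(v2i(x) ^ 0xffff) == x ^ <true, true, ...>.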
193 Constant *XorMask;
194 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
195 PatternMatch::m_Value(ArgArg)),
196 PatternMatch::m_Constant(XorMask))) &&
197 II.getType() == ArgArg->getType()) {
198 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
199 if (CI->getValue().trunc(16).isAllOnes()) {
200 auto TrueVector = IC.Builder.CreateVectorSplat(
201 cast<FixedVectorType>(II.getType())->getNumElements(),
202 IC.Builder.getTrue());
203 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
204 }
205 }
206 }
207 KnownBits ScalarKnown(32);
208 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
209 ScalarKnown)) {
210 return &II;
211 }
212 break;
213 }
214 case Intrinsic::arm_mve_pred_v2i: {
215 Value *Arg = II.getArgOperand(0);
216 Value *ArgArg;
217 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
218 PatternMatch::m_Value(ArgArg)))) {
219 return IC.replaceInstUsesWith(II, ArgArg);
220 }
221
222 if (II.getMetadata(LLVMContext::MD_range))
223 break;
224
225 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
226
227 if (auto CurrentRange = II.getRange()) {
228 Range = Range.intersectWith(*CurrentRange);
229 if (Range == CurrentRange)
230 break;
231 }
232
233 II.addRangeRetAttr(Range);
234 II.addRetAttr(Attribute::NoUndef);
235 return &II;
236 }
237 case Intrinsic::arm_mve_vadc:
238 case Intrinsic::arm_mve_vadc_predicated: {
239 unsigned CarryOp =
240 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
241 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
242 "Bad type for intrinsic!");
243
244 KnownBits CarryKnown(32);
245 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
246 CarryKnown)) {
247 return &II;
248 }
249 break;
250 }
251 case Intrinsic::arm_mve_vmldava: {
252 Instruction *I = cast<Instruction>(&II);
253 if (I->hasOneUse()) {
254 auto *User = cast<Instruction>(*I->user_begin());
255 Value *OpZ;
256 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
257 match(I->getOperand(3), m_Zero())) {
258 Value *OpX = I->getOperand(4);
259 Value *OpY = I->getOperand(5);
260 Type *OpTy = OpX->getType();
261
262 IC.Builder.SetInsertPoint(User);
263 Value *V =
264 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
265 {I->getOperand(0), I->getOperand(1),
266 I->getOperand(2), OpZ, OpX, OpY});
267
268 IC.replaceInstUsesWith(*User, V);
269 return IC.eraseInstFromFunction(*User);
270 }
271 }
272 return std::nullopt;
273 }
274 }
275 return std::nullopt;
276 }
277
278 std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
279 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
280 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
281 std::function<void(Instruction *, unsigned, APInt, APInt &)>
282 SimplifyAndSetOp) const {
283
284 // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
285 // index of the operand specifying whether this is a Top or Bottom instruction,
286 // which can change between intrinsics.
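// For example, with an 8-element result a "top" instruction writes the odd
// result lanes, so only the even lanes (0, 2, 4, 6) of operand 0 remain
// demanded; for a "bottom" instruction it is the odd lanes instead.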
287 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
288 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
289 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
290
291 // Only the odd/even lanes of operand 0 will be demanded, depending on
292 // whether this is a top/bottom instruction.
293 APInt DemandedElts =
294 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
295 : APInt::getHighBitsSet(2, 1));
296 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
297 // The other lanes will be defined from the inserted elements.
298 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
299 : APInt::getHighBitsSet(2, 1));
300 return std::nullopt;
301 };
302
303 switch (II.getIntrinsicID()) {
304 default:
305 break;
306 case Intrinsic::arm_mve_vcvt_narrow:
307 SimplifyNarrowInstrTopBottom(2);
308 break;
309 case Intrinsic::arm_mve_vqmovn:
310 SimplifyNarrowInstrTopBottom(4);
311 break;
312 case Intrinsic::arm_mve_vshrn:
313 SimplifyNarrowInstrTopBottom(7);
314 break;
315 }
316
317 return std::nullopt;
318 }
319
320 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
321 TTI::TargetCostKind CostKind) const {
322 assert(Ty->isIntegerTy());
323
324 unsigned Bits = Ty->getPrimitiveSizeInBits();
325 if (Bits == 0 || Imm.getActiveBits() >= 64)
326 return 4;
327
328 int64_t SImmVal = Imm.getSExtValue();
329 uint64_t ZImmVal = Imm.getZExtValue();
330 if (!ST->isThumb()) {
331 if ((SImmVal >= 0 && SImmVal < 65536) ||
332 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
333 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
334 return 1;
335 return ST->hasV6T2Ops() ? 2 : 3;
336 }
337 if (ST->isThumb2()) {
338 if ((SImmVal >= 0 && SImmVal < 65536) ||
339 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
340 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
341 return 1;
342 return ST->hasV6T2Ops() ? 2 : 3;
343 }
344 // Thumb1: any i8 immediate costs 1.
345 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
346 return 1;
347 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
348 return 2;
349 // Load from constantpool.
350 return 3;
351 }
352
353 // Constants smaller than 256 fit in the immediate field of Thumb1
354 // instructions, so we return a cost of zero for them and 1 otherwise.
355 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
356 const APInt &Imm,
357 Type *Ty) const {
358 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
359 return 0;
360
361 return 1;
362 }
363
364 // Checks whether Inst is part of a min(max()) or max(min()) pattern
365 // that will match to an SSAT instruction. Returns the instruction being
366 // saturated, or null if no saturation pattern was found.
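// For example, smax(smin(x, 127), -128) clamps x to the signed 8-bit range and
// can be selected as "ssat r0, #8, r1"; here Imm is -128 and the inner min
// constant is (-Imm)-1 == 127.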
367 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
368 Value *LHS, *RHS;
369 ConstantInt *C;
370 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
371
372 if (InstSPF == SPF_SMAX &&
373 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
374 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
375
376 auto isSSatMin = [&](Value *MinInst) {
377 if (isa<SelectInst>(MinInst)) {
378 Value *MinLHS, *MinRHS;
379 ConstantInt *MinC;
380 SelectPatternFlavor MinSPF =
381 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
382 if (MinSPF == SPF_SMIN &&
383 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
384 MinC->getValue() == ((-Imm) - 1))
385 return true;
386 }
387 return false;
388 };
389
390 if (isSSatMin(Inst->getOperand(1)))
391 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
392 if (Inst->hasNUses(2) &&
393 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
394 return Inst->getOperand(1);
395 }
396 return nullptr;
397 }
398
399 // Look for an FP saturation pattern, where the instruction can be simplified
400 // to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
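// For example, smax(smin(fptosi(x), 2147483647), -2147483648) clamps the
// conversion to the i32 range and can be lowered to a saturating fptosi.sat
// call, so the -2147483648 immediate costs nothing.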
401 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
402 if (Imm.getBitWidth() != 64 ||
403 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
404 return false;
405 Value *FP = isSSATMinMaxPattern(Inst, Imm);
406 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
407 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
408 if (!FP)
409 return false;
410 return isa<FPToSIInst>(FP);
411 }
412
413 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
414 const APInt &Imm, Type *Ty,
415 TTI::TargetCostKind CostKind,
416 Instruction *Inst) const {
417 // Division by a constant can be turned into multiplication, but only if we
418 // know it's a constant. So it's not so much that the immediate is cheap (it's
419 // not), but that the alternative is worse.
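// For example, "x / 10" can be lowered to a multiply-high by a magic constant
// plus shifts, so the divisor immediate never needs materialising on its own.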
420 // FIXME: this is probably unneeded with GlobalISel.
421 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
422 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
423 Idx == 1)
424 return 0;
425
426 // Leave any gep offsets to CodeGenPrepare, which will do a better job of
427 // splitting any large offsets.
428 if (Opcode == Instruction::GetElementPtr && Idx != 0)
429 return 0;
430
431 if (Opcode == Instruction::And) {
432 // UXTB/UXTH
433 if (Imm == 255 || Imm == 65535)
434 return 0;
435 // Conversion to BIC is free, and means we can use ~Imm instead.
436 return std::min(getIntImmCost(Imm, Ty, CostKind),
437 getIntImmCost(~Imm, Ty, CostKind));
438 }
439
440 if (Opcode == Instruction::Add)
441 // Conversion to SUB is free, and means we can use -Imm instead.
442 return std::min(getIntImmCost(Imm, Ty, CostKind),
443 getIntImmCost(-Imm, Ty, CostKind));
444
445 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
446 Ty->getIntegerBitWidth() == 32) {
447 int64_t NegImm = -Imm.getSExtValue();
448 if (ST->isThumb2() && NegImm < 1<<12)
449 // icmp X, #-C -> cmn X, #C
450 return 0;
451 if (ST->isThumb() && NegImm < 1<<8)
452 // icmp X, #-C -> adds X, #C
453 return 0;
454 }
455
456 // xor a, -1 can always be folded to MVN
457 if (Opcode == Instruction::Xor && Imm.isAllOnes())
458 return 0;
459
460 // Ensure that negative constants of min(max()) or max(min()) patterns that
461 // match SSAT instructions don't get hoisted.
462 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
463 Ty->getIntegerBitWidth() <= 32) {
464 if (isSSATMinMaxPattern(Inst, Imm) ||
465 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
466 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
467 return 0;
468 }
469
470 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
471 return 0;
472
473 // We can convert <= -1 to < 0, which is generally quite cheap.
474 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
475 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
476 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
477 return std::min(getIntImmCost(Imm, Ty, CostKind),
478 getIntImmCost(Imm + 1, Ty, CostKind));
479 }
480
481 return getIntImmCost(Imm, Ty, CostKind);
482 }
483
484 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
485 TTI::TargetCostKind CostKind,
486 const Instruction *I) const {
487 if (CostKind == TTI::TCK_RecipThroughput &&
488 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
489 // FIXME: The vectorizer is highly sensitive to the cost of these
490 // instructions, which suggests that it may be using the costs incorrectly.
491 // But, for now, just make them free to avoid performance regressions for
492 // vector targets.
493 return 0;
494 }
495 return BaseT::getCFInstrCost(Opcode, CostKind, I);
496 }
497
498 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
499 Type *Src,
500 TTI::CastContextHint CCH,
501 TTI::TargetCostKind CostKind,
502 const Instruction *I) const {
503 int ISD = TLI->InstructionOpcodeToISD(Opcode);
504 assert(ISD && "Invalid opcode");
505
506 // TODO: Allow non-throughput costs that aren't binary.
507 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
508 if (CostKind != TTI::TCK_RecipThroughput)
509 return Cost == 0 ? 0 : 1;
510 return Cost;
511 };
512 auto IsLegalFPType = [this](EVT VT) {
513 EVT EltVT = VT.getScalarType();
514 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
515 (EltVT == MVT::f64 && ST->hasFP64()) ||
516 (EltVT == MVT::f16 && ST->hasFullFP16());
517 };
518
519 EVT SrcTy = TLI->getValueType(DL, Src);
520 EVT DstTy = TLI->getValueType(DL, Dst);
521
522 if (!SrcTy.isSimple() || !DstTy.isSimple())
523 return AdjustCost(
524 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
525
526 // Extending masked loads and truncating masked stores are expensive because
527 // we currently don't split them. This means that we'll likely end up
528 // loading/storing each element individually (hence the high cost).
529 if ((ST->hasMVEIntegerOps() &&
530 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
531 Opcode == Instruction::SExt)) ||
532 (ST->hasMVEFloatOps() &&
533 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
534 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
535 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
536 return 2 * DstTy.getVectorNumElements() *
537 ST->getMVEVectorCostFactor(CostKind);
538
539 // The extend of other kinds of load is free
540 if (CCH == TTI::CastContextHint::Normal ||
541 CCH == TTI::CastContextHint::Masked) {
542 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
543 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
544 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
545 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
546 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
547 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
548 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
549 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
550 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
551 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
552 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
553 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
554 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
555 };
556 if (const auto *Entry = ConvertCostTableLookup(
557 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
558 return AdjustCost(Entry->Cost);
559
560 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
561 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
562 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
563 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
564 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
565 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
566 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
567 // The following extend from a legal type to an illegal type, so the load
568 // needs to be split. This introduces an extra load operation, but the
569 // extend is still "free".
570 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
571 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
572 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
573 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
574 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
575 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
576 };
577 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
578 if (const auto *Entry =
579 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
580 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
581 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
582 }
583
584 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
585 // FPExtends are similar but also require the VCVT instructions.
586 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
587 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
588 };
589 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
590 if (const auto *Entry =
591 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
592 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
593 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
594 }
595
596 // The truncate of a store is free. This is the mirror of extends above.
597 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
598 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
599 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
600 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
601 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
602 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
603 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
604 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
605 };
606 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
607 if (const auto *Entry =
608 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
609 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
610 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
611 }
612
613 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
614 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
615 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
616 };
617 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
618 if (const auto *Entry =
619 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
620 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
621 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
622 }
623 }
624
625 // NEON vector operations that can extend their inputs.
626 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
627 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
628 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
629 // vaddl
630 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
631 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
632 // vsubl
633 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
634 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
635 // vmull
636 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
637 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
638 // vshll
639 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
640 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
641 };
642
643 auto *User = cast<Instruction>(*I->user_begin());
644 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
645 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
646 DstTy.getSimpleVT(),
647 SrcTy.getSimpleVT())) {
648 return AdjustCost(Entry->Cost);
649 }
650 }
651
652 // Single to/from double precision conversions.
653 if (Src->isVectorTy() && ST->hasNEON() &&
654 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
655 DstTy.getScalarType() == MVT::f32) ||
656 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
657 DstTy.getScalarType() == MVT::f64))) {
658 static const CostTblEntry NEONFltDblTbl[] = {
659 // Vector fptrunc/fpext conversions.
660 {ISD::FP_ROUND, MVT::v2f64, 2},
661 {ISD::FP_EXTEND, MVT::v2f32, 2},
662 {ISD::FP_EXTEND, MVT::v4f32, 4}};
663
664 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
665 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
666 return AdjustCost(LT.first * Entry->Cost);
667 }
668
669 // Some arithmetic, load and store operations have specific instructions
670 // to cast up/down their types automatically at no extra cost.
671 // TODO: Get these tables to know at least what the related operations are.
672 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
673 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
674 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
675 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
676 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
677 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
678 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
679
680 // The number of vmovl instructions for the extension.
681 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
682 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
683 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
684 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
685 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
686 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
687 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
688 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
689 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
690 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
691 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
692 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
693 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
694 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
695 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
696 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
697 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
698 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
699
700 // Operations that we legalize using splitting.
701 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
703
704 // Vector float <-> i32 conversions.
705 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
706 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
707
708 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
709 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
710 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
711 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
712 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
713 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
714 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
715 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
716 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
717 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
718 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
719 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
720 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
721 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
722 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
724 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
725 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
726 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
727 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
728
729 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
730 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
731 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
732 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
733 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
734 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
735
736 // Vector double <-> i32 conversions.
737 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
738 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
739
740 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
741 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
742 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
743 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
744 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
745 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
746
747 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
748 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
749 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
750 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
751 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
752 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
753 };
754
755 if (SrcTy.isVector() && ST->hasNEON()) {
756 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
757 DstTy.getSimpleVT(),
758 SrcTy.getSimpleVT()))
759 return AdjustCost(Entry->Cost);
760 }
761
762 // Scalar float to integer conversions.
763 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
764 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
765 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
766 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
767 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
768 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
769 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
770 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
771 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
772 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
773 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
774 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
775 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
776 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
777 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
778 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
779 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
780 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
781 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
782 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
783 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
784 };
785 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
786 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
787 DstTy.getSimpleVT(),
788 SrcTy.getSimpleVT()))
789 return AdjustCost(Entry->Cost);
790 }
791
792 // Scalar integer to float conversions.
793 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
794 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
795 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
796 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
797 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
798 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
799 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
800 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
801 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
802 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
803 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
804 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
805 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
806 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
807 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
808 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
809 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
810 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
811 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
812 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
813 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
814 };
815
816 if (SrcTy.isInteger() && ST->hasNEON()) {
817 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
818 ISD, DstTy.getSimpleVT(),
819 SrcTy.getSimpleVT()))
820 return AdjustCost(Entry->Cost);
821 }
822
823 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
824 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
825 // are linearised so take more.
826 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
827 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
828 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
829 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
830 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
831 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
832 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
833 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
834 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
835 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
836 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
837 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
838 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
839 };
840
841 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
842 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
843 ISD, DstTy.getSimpleVT(),
844 SrcTy.getSimpleVT()))
845 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
846 }
847
848 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
849 // As a general rule, fp converts that were not matched above are scalarized
850 // and cost 1 vcvt for each lane, so long as the instruction is available.
851 // If not, they will become a series of function calls.
852 const InstructionCost CallCost =
853 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
854 int Lanes = 1;
855 if (SrcTy.isFixedLengthVector())
856 Lanes = SrcTy.getVectorNumElements();
857
858 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
859 return Lanes;
860 else
861 return Lanes * CallCost;
862 }
863
864 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
865 SrcTy.isFixedLengthVector()) {
866 // Treat a truncate with larger than legal source (128 bits for MVE) as
867 // expensive, 2 instructions per lane.
868 if ((SrcTy.getScalarType() == MVT::i8 ||
869 SrcTy.getScalarType() == MVT::i16 ||
870 SrcTy.getScalarType() == MVT::i32) &&
871 SrcTy.getSizeInBits() > 128 &&
872 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
873 return SrcTy.getVectorNumElements() * 2;
874 }
875
876 // Scalar integer conversion costs.
877 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
878 // i16 -> i64 requires two dependent operations.
879 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
880
881 // Truncates on i64 are assumed to be free.
882 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
883 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
884 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
885 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
886 };
887
888 if (SrcTy.isInteger()) {
889 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
890 DstTy.getSimpleVT(),
891 SrcTy.getSimpleVT()))
892 return AdjustCost(Entry->Cost);
893 }
894
895 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
896 ? ST->getMVEVectorCostFactor(CostKind)
897 : 1;
898 return AdjustCost(
899 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
900 }
901
902 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
903 TTI::TargetCostKind CostKind,
904 unsigned Index, const Value *Op0,
905 const Value *Op1) const {
906 // Penalize inserting into a D-subregister. We end up with a three times
907 // lower estimated throughput on Swift.
908 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
909 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
910 return 3;
911
912 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
913 Opcode == Instruction::ExtractElement)) {
914 // Cross-class copies are expensive on many microarchitectures,
915 // so assume they are expensive by default.
916 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
917 return 3;
918
919 // Even if it's not a cross-class copy, this likely leads to mixing
920 // of NEON and VFP code and should therefore be penalized.
921 if (ValTy->isVectorTy() &&
922 ValTy->getScalarSizeInBits() <= 32)
923 return std::max<InstructionCost>(
924 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
925 2U);
926 }
927
928 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
929 Opcode == Instruction::ExtractElement)) {
930 // Integer cross-lane moves are more expensive than float, which can
931 // sometimes just be vmovs. Integer ones involve being passed to GPR
932 // registers, causing more of a delay.
933 std::pair<InstructionCost, MVT> LT =
934 getTypeLegalizationCost(ValTy->getScalarType());
935 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
936 }
937
938 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
939 }
940
941 InstructionCost ARMTTIImpl::getCmpSelInstrCost(
942 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
943 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
944 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
945 int ISD = TLI->InstructionOpcodeToISD(Opcode);
946
947 // Thumb scalar code size cost for select.
948 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
949 ST->isThumb() && !ValTy->isVectorTy()) {
950 // Assume expensive structs.
951 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
952 return TTI::TCC_Expensive;
953
954 // Select costs can vary because they:
955 // - may require one or more conditional mov (including an IT),
956 // - can't operate directly on immediates,
957 // - require live flags, which we can't copy around easily.
958 InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
959
960 // Possible IT instruction for Thumb2, or more for Thumb1.
961 ++Cost;
962
963 // i1 values may need rematerialising by using mov immediates and/or
964 // flag setting instructions.
965 if (ValTy->isIntegerTy(1))
966 ++Cost;
967
968 return Cost;
969 }
970
971 // If this is a vector min/max/abs, use the cost of that intrinsic directly
972 // instead. Hopefully when min/max intrinsics are more prevalent this code
973 // will not be needed.
974 const Instruction *Sel = I;
975 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
976 Sel->hasOneUse())
977 Sel = cast<Instruction>(Sel->user_back());
978 if (Sel && ValTy->isVectorTy() &&
979 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
980 const Value *LHS, *RHS;
981 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
982 unsigned IID = 0;
983 switch (SPF) {
984 case SPF_ABS:
985 IID = Intrinsic::abs;
986 break;
987 case SPF_SMIN:
988 IID = Intrinsic::smin;
989 break;
990 case SPF_SMAX:
991 IID = Intrinsic::smax;
992 break;
993 case SPF_UMIN:
994 IID = Intrinsic::umin;
995 break;
996 case SPF_UMAX:
997 IID = Intrinsic::umax;
998 break;
999 case SPF_FMINNUM:
1000 IID = Intrinsic::minnum;
1001 break;
1002 case SPF_FMAXNUM:
1003 IID = Intrinsic::maxnum;
1004 break;
1005 default:
1006 break;
1007 }
1008 if (IID) {
1009 // The ICmp is free, the select gets the cost of the min/max/etc
1010 if (Sel != I)
1011 return 0;
1012 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1013 return getIntrinsicInstrCost(CostAttrs, CostKind);
1014 }
1015 }
1016
1017 // On NEON a vector select gets lowered to vbsl.
1018 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1019 // Lowering of some vector selects is currently far from perfect.
1020 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1021 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1022 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1023 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1024 };
1025
1026 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1027 EVT SelValTy = TLI->getValueType(DL, ValTy);
1028 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1029 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1030 SelCondTy.getSimpleVT(),
1031 SelValTy.getSimpleVT()))
1032 return Entry->Cost;
1033 }
1034
1035 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1036 return LT.first;
1037 }
1038
1039 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1040 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1041 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1042 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1043 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1044 if (!VecCondTy)
1045 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1046
1047 // If we don't have mve.fp, any fp operations will need to be scalarized.
1048 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1049 // One scalarization insert, one scalarization extract and the cost of the
1050 // fcmps.
1051 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1052 /*Extract*/ true, CostKind) +
1053 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1054 /*Extract*/ false, CostKind) +
1055 VecValTy->getNumElements() *
1056 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1057 VecCondTy->getScalarType(), VecPred,
1058 CostKind, Op1Info, Op2Info, I);
1059 }
1060
1061 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1062 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063 // There are two types - the input that specifies the type of the compare
1064 // and the output vXi1 type. Because we don't know how the output will be
1065 // split, we may need an expensive shuffle to get the two in sync. This has the
1066 // effect of making larger than legal compares (v8i32 for example)
1067 // expensive.
1068 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1069 if (LT.first > 1)
1070 return LT.first * BaseCost +
1071 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1072 /*Extract*/ false, CostKind);
1073 return BaseCost;
1074 }
1075 }
1076
1077 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1078 // for "multiple beats" potentially needed by MVE instructions.
1079 int BaseCost = 1;
1080 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1081 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1082
1083 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1084 CostKind, Op1Info, Op2Info, I);
1085 }
1086
1087 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1088 ScalarEvolution *SE,
1089 const SCEV *Ptr) const {
1090 // Address computations in vectorized code with non-consecutive addresses will
1091 // likely result in more instructions compared to scalar code where the
1092 // computation can more often be merged into the index mode. The resulting
1093 // extra micro-ops can significantly decrease throughput.
1094 unsigned NumVectorInstToHideOverhead = 10;
1095 int MaxMergeDistance = 64;
1096
1097 if (ST->hasNEON()) {
1098 if (Ty->isVectorTy() && SE &&
1099 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1100 return NumVectorInstToHideOverhead;
1101
1102 // In many cases the address computation is not merged into the instruction
1103 // addressing mode.
1104 return 1;
1105 }
1106 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1107 }
1108
1109 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1110 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1111 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1112 // optimized, else LSR may block tail-predication.
1113 switch (II->getIntrinsicID()) {
1114 case Intrinsic::arm_mve_vctp8:
1115 case Intrinsic::arm_mve_vctp16:
1116 case Intrinsic::arm_mve_vctp32:
1117 case Intrinsic::arm_mve_vctp64:
1118 return true;
1119 default:
1120 break;
1121 }
1122 }
1123 return false;
1124 }
1125
1126 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1127 unsigned /*AddressSpace*/) const {
1128 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1129 return false;
1130
1131 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1132 // Don't support v2i1 yet.
1133 if (VecTy->getNumElements() == 2)
1134 return false;
1135
1136 // We don't support extending fp types.
1137 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1138 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1139 return false;
1140 }
1141
1142 unsigned EltWidth = DataTy->getScalarSizeInBits();
1143 return (EltWidth == 32 && Alignment >= 4) ||
1144 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1145 }
1146
1147 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1148 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1149 return false;
1150
1151 unsigned EltWidth = Ty->getScalarSizeInBits();
1152 return ((EltWidth == 32 && Alignment >= 4) ||
1153 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1154 }
1155
1156 /// Given a memcpy/memset/memmove instruction, return the number of memory
1157 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1158 /// call is used.
1159 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1160 MemOp MOp;
1161 unsigned DstAddrSpace = ~0u;
1162 unsigned SrcAddrSpace = ~0u;
1163 const Function *F = I->getParent()->getParent();
1164
1165 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1166 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1167 // If 'size' is not a constant, a library call will be generated.
1168 if (!C)
1169 return -1;
1170
1171 const unsigned Size = C->getValue().getZExtValue();
1172 const Align DstAlign = MC->getDestAlign().valueOrOne();
1173 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1174
1175 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1176 /*IsVolatile*/ false);
1177 DstAddrSpace = MC->getDestAddressSpace();
1178 SrcAddrSpace = MC->getSourceAddressSpace();
1179 }
1180 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1181 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1182 // If 'size' is not a constant, a library call will be generated.
1183 if (!C)
1184 return -1;
1185
1186 const unsigned Size = C->getValue().getZExtValue();
1187 const Align DstAlign = MS->getDestAlign().valueOrOne();
1188
1189 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1190 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1191 DstAddrSpace = MS->getDestAddressSpace();
1192 }
1193 else
1194 llvm_unreachable("Expected a memcpy/move or memset!");
1195
1196 unsigned Limit, Factor = 2;
1197 switch(I->getIntrinsicID()) {
1198 case Intrinsic::memcpy:
1199 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1200 break;
1201 case Intrinsic::memmove:
1202 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1203 break;
1204 case Intrinsic::memset:
1205 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1206 Factor = 1;
1207 break;
1208 default:
1209 llvm_unreachable("Expected a memcpy/move or memset!");
1210 }
1211
1212 // MemOps will be populated with a list of data types that need to be
1213 // loaded and stored. That's why we multiply the number of elements by 2 to
1214 // get the cost for this memcpy.
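// For example, a memcpy lowered as two block copies gives MemOps.size() == 2
// and, with Factor == 2 (one load and one store per block), a returned count
// of 4.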
1215 std::vector<EVT> MemOps;
1216 LLVMContext &C = F->getContext();
1217 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1218 SrcAddrSpace, F->getAttributes()))
1219 return MemOps.size() * Factor;
1220
1221 // If we can't find an optimal memop lowering, return the default cost
1222 return -1;
1223 }
1224
1225 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1226 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1227
1228 // To model the cost of a library call, we assume 1 for the call, and
1229 // 3 for the argument setup.
1230 if (NumOps == -1)
1231 return 4;
1232 return NumOps;
1233 }
1234
1235 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1236 VectorType *DstTy, VectorType *SrcTy,
1237 ArrayRef<int> Mask,
1238 TTI::TargetCostKind CostKind,
1239 int Index, VectorType *SubTp,
1240 ArrayRef<const Value *> Args,
1241 const Instruction *CxtI) const {
1242 assert((Mask.empty() || DstTy->isScalableTy() ||
1243 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1244 "Expected the Mask to match the return size if given");
1245 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1246 "Expected the same scalar types");
1247
1248 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1249 // Treat extractsubvector as single op permutation.
1250 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1251 if (IsExtractSubvector)
1252 Kind = TTI::SK_PermuteSingleSrc;
1253 if (ST->hasNEON()) {
1254 if (Kind == TTI::SK_Broadcast) {
1255 static const CostTblEntry NEONDupTbl[] = {
1256 // VDUP handles these cases.
1257 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1258 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1259 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1260 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1261 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1262 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1263
1264 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1265 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1266 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1267 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1268
1269 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1270 if (const auto *Entry =
1271 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1272 return LT.first * Entry->Cost;
1273 }
1274 if (Kind == TTI::SK_Reverse) {
1275 static const CostTblEntry NEONShuffleTbl[] = {
1276 // Reverse shuffles cost one instruction if we are shuffling within a
1277 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1278 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1279 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1280 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1281 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1282 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1283 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1284
1285 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1286 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1287 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1288 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1289
1290 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1291 if (const auto *Entry =
1292 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1293 return LT.first * Entry->Cost;
1294 }
1295 if (Kind == TTI::SK_Select) {
1296 static const CostTblEntry NEONSelShuffleTbl[] = {
1297 // Select shuffle cost table for ARM. Cost is the number of instructions
1298 // required to create the shuffled vector.
1300
1301 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1302 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1303 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1304 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1305
1306 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1307 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1308 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1309
1310 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1311
1312 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1313
1314 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1315 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1316 ISD::VECTOR_SHUFFLE, LT.second))
1317 return LT.first * Entry->Cost;
1318 }
1319 }
1320 if (ST->hasMVEIntegerOps()) {
1321 if (Kind == TTI::SK_Broadcast) {
1322 static const CostTblEntry MVEDupTbl[] = {
1323 // VDUP handles these cases.
1324 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1325 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1326 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1327 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1328 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1329
1330 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1331 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1332 LT.second))
1333 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1334 }
1335
1336 if (!Mask.empty()) {
1337 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1338 if (LT.second.isVector() &&
1339 Mask.size() <= LT.second.getVectorNumElements() &&
1340 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1341 isVREVMask(Mask, LT.second, 64)))
1342 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1343 }
1344 }
1345
1346 // Restore optimal kind.
1347 if (IsExtractSubvector)
1348 Kind = TTI::SK_ExtractSubvector;
1349 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1350 ? ST->getMVEVectorCostFactor(CostKind)
1351 : 1;
1352 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1353 Index, SubTp);
1354 }
1355
1356 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1357 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1358 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1359 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1360 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1361 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1362 // Make operations on i1 relatively expensive as this often involves
1363 // combining predicates. AND and XOR should be easier to handle with IT
1364 // blocks.
1365 switch (ISDOpcode) {
1366 default:
1367 break;
1368 case ISD::AND:
1369 case ISD::XOR:
1370 return 2;
1371 case ISD::OR:
1372 return 3;
1373 }
1374 }
1375
1376 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1377
1378 if (ST->hasNEON()) {
1379 const unsigned FunctionCallDivCost = 20;
1380 const unsigned ReciprocalDivCost = 10;
1381 static const CostTblEntry CostTbl[] = {
1382 // Division.
1383 // These costs are somewhat random. Choose a cost of 20 to indicate that
1384 // vectorizing division (added function call) is going to be very expensive.
1385 // Double registers types.
1386 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1387 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1388 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1389 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1390 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1391 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1392 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1393 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1394 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1395 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1396 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1397 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1398 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1399 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1400 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1401 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1402 // Quad register types.
1403 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1404 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1405 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1406 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1407 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1408 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1409 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1410 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1411 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1412 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1413 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1414 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1415 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1416 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1417 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1418 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1419 // Multiplication.
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1423 return LT.first * Entry->Cost;
1424
1425 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1426 Opcode, Ty, CostKind, Op1Info, Op2Info);
1427
1428 // This is somewhat of a hack. The problem that we are facing is that SROA
1429 // creates a sequence of shift, and, or instructions to construct values.
1430 // These sequences are recognized by the ISel and have zero-cost. Not so for
1431 // the vectorized code. Because we have support for v2i64 but not i64 those
1432 // sequences look particularly beneficial to vectorize.
1433 // To work around this we increase the cost of v2i64 operations to make them
1434 // seem less beneficial.
1435 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1436 Cost += 4;
1437
1438 return Cost;
1439 }
1440
1441 // If this operation is a shift on arm/thumb2, it might well be folded into
1442 // the following instruction, hence having a cost of 0.
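// For example (illustrative), a shl feeding an add can become a single
// shifted-operand instruction such as "add r0, r1, r2, lsl #2", making the
// shift itself effectively free.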
1443 auto LooksLikeAFreeShift = [&]() {
1444 if (ST->isThumb1Only() || Ty->isVectorTy())
1445 return false;
1446
1447 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1448 return false;
1449 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1450 return false;
1451
1452 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1453 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1454 case Instruction::Add:
1455 case Instruction::Sub:
1456 case Instruction::And:
1457 case Instruction::Xor:
1458 case Instruction::Or:
1459 case Instruction::ICmp:
1460 return true;
1461 default:
1462 return false;
1463 }
1464 };
1465 if (LooksLikeAFreeShift())
1466 return 0;
1467
1468 // When targets have both DSP and MVE we find that the
1469 // compiler will attempt to vectorize as well as using
1470 // scalar (S/U)MLAL operations. In cases where we have
1471 // the pattern ext(mul(ext(i16), ext(i16))) we find
1472 // that codegen performs better when only using (S/U)MLAL scalar
1473 // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1474 // check if a mul instruction is used in a (U/S)MLAL pattern.
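// For example (illustrative IR only), a chain such as:
//   %ea = sext i16 %a to i32
//   %eb = sext i16 %b to i32
//   %m  = mul i32 %ea, %eb
//   %w  = sext i32 %m to i64   ; accumulated into an i64 sum
// is expected to lower better as a scalar SMLAL than as mixed vector code.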
1475 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1476 Type *Ty) -> bool {
1477 if (!ST->hasDSP())
1478 return false;
1479
1480 if (!I)
1481 return false;
1482
1483 if (Opcode != Instruction::Mul)
1484 return false;
1485
1486 if (Ty->isVectorTy())
1487 return false;
1488
1489 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1490 return cast<Instruction>(LHS)->getOpcode() ==
1491 cast<Instruction>(RHS)->getOpcode();
1492 };
1493 auto IsExtInst = [](const Value *V) -> bool {
1494 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1495 };
1496 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1497 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1498 };
1499
1500 // We check the arguments of the instruction to see if they're extends
1501 auto *BinOp = dyn_cast<BinaryOperator>(I);
1502 if (!BinOp)
1503 return false;
1504 Value *Op0 = BinOp->getOperand(0);
1505 Value *Op1 = BinOp->getOperand(1);
1506 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1507 // We're interested in an ext of an i16
1508 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1509 !IsExtensionFromHalf(Op1))
1510 return false;
1511 // We also need to check that the result is further extended (e.g. to i64),
1512 // i.e. that every use of the mul is an extend instruction.
1513 for (auto *U : I->users())
1514 if (!IsExtInst(U))
1515 return false;
1516 return true;
1517 }
1518
1519 return false;
1520 };
1521
1522 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1523 return 0;
1524
1525 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1526 // for "multiple beats" potentially needed by MVE instructions.
1527 int BaseCost = 1;
1528 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1529 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1530
1531 // The rest of this mostly follows what is done in
1532 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1533 // than scalars or increasing the costs for custom operations. The result is
1534 // also multiplied by the MVEVectorCostFactor where appropriate.
1535 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1536 return LT.first * BaseCost;
1537
1538 // Else this is expand, assume that we need to scalarize this op.
1539 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1540 unsigned Num = VTy->getNumElements();
1541 InstructionCost Cost =
1542 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1543 // Return the cost of multiple scalar invocations plus the cost of
1544 // inserting and extracting the values.
1545 SmallVector<Type *> Tys(Args.size(), Ty);
1546 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1547 Num * Cost;
1548 }
1549
1550 return BaseCost;
1551 }
1552
1553 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1554 Align Alignment,
1555 unsigned AddressSpace,
1556 TTI::TargetCostKind CostKind,
1557 TTI::OperandValueInfo OpInfo,
1558 const Instruction *I) const {
1559 // TODO: Handle other cost kinds.
1560 if (CostKind != TTI::TCK_RecipThroughput)
1561 return 1;
1562
1563 // Type legalization can't handle structs
1564 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1565 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1566 CostKind);
1567
1568 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1569 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1570 // Unaligned loads/stores are extremely inefficient.
1571 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1572 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1573 return LT.first * 4;
1574 }
1575
1576 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1577 // Same for stores.
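// For example (illustrative IR only):
//   %l = load <4 x half>, ptr %p
//   %e = fpext <4 x half> %l to <4 x float>
// is costed below as a single widening-load style operation.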
1578 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1579 ((Opcode == Instruction::Load && I->hasOneUse() &&
1580 isa<FPExtInst>(*I->user_begin())) ||
1581 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1582 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1583 Type *DstTy =
1584 Opcode == Instruction::Load
1585 ? (*I->user_begin())->getType()
1586 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1587 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1588 DstTy->getScalarType()->isFloatTy())
1589 return ST->getMVEVectorCostFactor(CostKind);
1590 }
1591
1592 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1593 ? ST->getMVEVectorCostFactor(CostKind)
1594 : 1;
1595 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1596 CostKind, OpInfo, I);
1597 }
1598
1599 InstructionCost
1600 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1601 unsigned AddressSpace,
1602 TTI::TargetCostKind CostKind) const {
1603 if (ST->hasMVEIntegerOps()) {
1604 if (Opcode == Instruction::Load &&
1605 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1606 return ST->getMVEVectorCostFactor(CostKind);
1607 if (Opcode == Instruction::Store &&
1608 isLegalMaskedStore(Src, Alignment, AddressSpace))
1609 return ST->getMVEVectorCostFactor(CostKind);
1610 }
1611 if (!isa<FixedVectorType>(Src))
1612 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1613 CostKind);
1614 // Scalar cost, which is currently very high due to the inefficiency of the
1615 // generated code.
1616 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1617 }
1618
1619 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1620 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1621 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1622 bool UseMaskForCond, bool UseMaskForGaps) const {
1623 assert(Factor >= 2 && "Invalid interleave factor");
1624 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1625
1626 // vldN/vstN don't support vector types with i64/f64 elements.
1627 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1628
1629 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1630 !UseMaskForCond && !UseMaskForGaps) {
1631 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1632 auto *SubVecTy =
1633 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1634
1635 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1636 // Accesses having vector types that are a multiple of 128 bits can be
1637 // matched to more than one vldN/vstN instruction.
1638 int BaseCost =
1639 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1640 if (NumElts % Factor == 0 &&
1641 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1642 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1643
1644 // Some smaller than legal interleaved patterns are cheap as we can make
1645 // use of the vmovn or vrev patterns to interleave a standard load. This is
1646 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1647 // promoted differently). The cost of 2 here is then a load and vrev or
1648 // vmovn.
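// For example (illustrative), a factor-2 access on a <8 x i8> vector has a
// <4 x i8> sub-vector (32 bits), so it can be handled as one load plus a
// vrev/vmovn style shuffle, giving the cost of 2 below.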
1649 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1650 VecTy->isIntOrIntVectorTy() &&
1651 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1652 return 2 * BaseCost;
1653 }
1654
1655 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1656 Alignment, AddressSpace, CostKind,
1657 UseMaskForCond, UseMaskForGaps);
1658 }
1659
1660 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1661 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1662 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1663 using namespace PatternMatch;
1664 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1665 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1666 Alignment, CostKind, I);
1667
1668 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1669 auto *VTy = cast<FixedVectorType>(DataTy);
1670
1671 // TODO: Splitting, once we do that.
1672
1673 unsigned NumElems = VTy->getNumElements();
1674 unsigned EltSize = VTy->getScalarSizeInBits();
1675 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1676
1677 // For now, it is assumed that for the MVE gather instructions the loads are
1678 // all effectively serialised. This means the cost is the scalar cost
1679 // multiplied by the number of elements being loaded. This is possibly very
1680 // conservative, but even so we still end up vectorising loops because the
1681 // cost per iteration for many loops is lower than for scalar loops.
1682 InstructionCost VectorCost =
1683 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1684 // The scalarization cost should be a lot higher. We use the number of vector
1685 // elements plus the scalarization overhead. If masking is required then a lot
1686 // of little blocks will be needed and potentially a scalarized p0 mask,
1687 // greatly increasing the cost.
1688 InstructionCost ScalarCost =
1689 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1690 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1691 CostKind) +
1692 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1693 CostKind);
1694
1695 if (EltSize < 8 || Alignment < EltSize / 8)
1696 return ScalarCost;
1697
1698 unsigned ExtSize = EltSize;
1699 // Check whether there's a single user that asks for an extended type
1700 if (I != nullptr) {
1701 // Depending on the caller of this function, a gather instruction will
1702 // either have opcode Instruction::Load or be a call to the masked_gather
1703 // intrinsic.
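// For example (illustrative IR only):
//   %g = call <4 x i16> @llvm.masked.gather.v4i16(...)
//   %z = zext <4 x i16> %g to <4 x i32>
// would set ExtSize to 32 below.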
1704 if ((I->getOpcode() == Instruction::Load ||
1705 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1706 I->hasOneUse()) {
1707 const User *Us = *I->users().begin();
1708 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1709 // only allow valid type combinations
1710 unsigned TypeSize =
1711 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1712 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1713 (TypeSize == 16 && EltSize == 8)) &&
1714 TypeSize * NumElems == 128) {
1715 ExtSize = TypeSize;
1716 }
1717 }
1718 }
1719 // Check whether the input data needs to be truncated
1720 TruncInst *T;
1721 if ((I->getOpcode() == Instruction::Store ||
1722 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1723 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1724 // Only allow valid type combinations
1725 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1726 if (((EltSize == 16 && TypeSize == 32) ||
1727 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1728 TypeSize * NumElems == 128)
1729 ExtSize = TypeSize;
1730 }
1731 }
1732
1733 if (ExtSize * NumElems != 128 || NumElems < 4)
1734 return ScalarCost;
1735
1736 // Any (aligned) i32 gather will not need to be scalarised.
1737 if (ExtSize == 32)
1738 return VectorCost;
1739 // For smaller types, we need to ensure that the gep's inputs are correctly
1740 // extended from a small enough value. Other sizes (including i64) are
1741 // scalarized for now.
1742 if (ExtSize != 8 && ExtSize != 16)
1743 return ScalarCost;
1744
1745 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1746 Ptr = BC->getOperand(0);
1747 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1748 if (GEP->getNumOperands() != 2)
1749 return ScalarCost;
1750 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1751 // Scale needs to be correct (which is only relevant for i16s).
1752 if (Scale != 1 && Scale * 8 != ExtSize)
1753 return ScalarCost;
1754 // And we need to zext (not sext) the indexes from a small enough type.
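// For example (illustrative IR only), with an ExtSize of 16:
//   %idx  = zext <8 x i8> %offs to <8 x i32>
//   %ptrs = getelementptr i16, ptr %base, <8 x i32> %idx
// keeps the index values narrow enough to stay on the vector path.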
1755 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1756 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1757 return VectorCost;
1758 }
1759 return ScalarCost;
1760 }
1761 return ScalarCost;
1762 }
1763
1764 InstructionCost
1765 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1766 std::optional<FastMathFlags> FMF,
1767 TTI::TargetCostKind CostKind) const {
1768
1769 EVT ValVT = TLI->getValueType(DL, ValTy);
1770 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1771 unsigned EltSize = ValVT.getScalarSizeInBits();
1772
1773 // In general floating point reductions are a series of elementwise
1774 // operations, with free extracts on each step. These are either in-order or
1775 // treewise depending on whether that is allowed by the fast math flags.
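// For example (illustrative), a fast-math fadd reduction of <8 x float> under
// MVE is modelled below as one <4 x float> fadd step followed by scalar fadds
// over the remaining lanes, with the extracts treated as free.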
1776 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1777 ((EltSize == 32 && ST->hasVFP2Base()) ||
1778 (EltSize == 64 && ST->hasFP64()) ||
1779 (EltSize == 16 && ST->hasFullFP16()))) {
1780 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1781 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1782 InstructionCost VecCost = 0;
1783 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1784 NumElts * EltSize > VecLimit) {
1785 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1786 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1787 NumElts /= 2;
1788 }
1789
1790 // For fp16 we need to extract the upper lane elements. MVE can add a
1791 // VREV+FMIN/MAX to perform another vector step instead.
1792 InstructionCost ExtractCost = 0;
1793 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1794 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1795 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1796 NumElts /= 2;
1797 } else if (ValVT.getVectorElementType() == MVT::f16)
1798 ExtractCost = NumElts / 2;
1799
1800 return VecCost + ExtractCost +
1801 NumElts *
1802 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1803 }
1804
1805 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1806 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1807 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1808 unsigned VecLimit =
1809 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1810 InstructionCost VecCost = 0;
1811 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1812 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1813 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1814 NumElts /= 2;
1815 }
1816 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1817 // step.
1818 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1819 NumElts * EltSize == 64) {
1820 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1821 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1822 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1823 NumElts /= 2;
1824 }
1825
1826 // From here we extract the elements and perform the and/or/xor.
1827 InstructionCost ExtractCost = NumElts;
1828 return VecCost + ExtractCost +
1829 (NumElts - 1) * getArithmeticInstrCost(
1830 Opcode, ValTy->getElementType(), CostKind);
1831 }
1832
1833 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1834 TTI::requiresOrderedReduction(FMF))
1835 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1836
1837 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1838
1839 static const CostTblEntry CostTblAdd[]{
1840 {ISD::ADD, MVT::v16i8, 1},
1841 {ISD::ADD, MVT::v8i16, 1},
1842 {ISD::ADD, MVT::v4i32, 1},
1843 };
1844 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1845 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1846
1847 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1848 }
1849
1850 InstructionCost ARMTTIImpl::getExtendedReductionCost(
1851 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1852 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1853 EVT ValVT = TLI->getValueType(DL, ValTy);
1854 EVT ResVT = TLI->getValueType(DL, ResTy);
1855
1856 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1857
1858 switch (ISD) {
1859 case ISD::ADD:
1860 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1861 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1862
1863 // The legal cases are:
1864 // VADDV u/s 8/16/32
1865 // VADDLV u/s 32
1866 // Codegen currently cannot always handle larger than legal vectors very
1867 // well, especially for predicated reductions where the mask needs to be
1868 // split, so restrict to 128bit or smaller input types.
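// For example (illustrative IR only):
//   %e = zext <16 x i8> %x to <16 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
// maps onto a single VADDV.u8 style reduction.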
1869 unsigned RevVTSize = ResVT.getSizeInBits();
1870 if (ValVT.getSizeInBits() <= 128 &&
1871 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1872 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1873 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1874 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1875 }
1876 break;
1877 default:
1878 break;
1879 }
1880 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1881 CostKind);
1882 }
1883
1884 InstructionCost
1885 ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1886 VectorType *ValTy,
1887 TTI::TargetCostKind CostKind) const {
1888 EVT ValVT = TLI->getValueType(DL, ValTy);
1889 EVT ResVT = TLI->getValueType(DL, ResTy);
1890
1891 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1892 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1893
1894 // The legal cases are:
1895 // VMLAV u/s 8/16/32
1896 // VMLALV u/s 16/32
1897 // Codegen currently cannot always handle larger than legal vectors very
1898 // well, especially for predicated reductions where the mask needs to be
1899 // split, so restrict to 128bit or smaller input types.
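// For example (illustrative IR only):
//   %xe = zext <8 x i16> %x to <8 x i32>
//   %ye = zext <8 x i16> %y to <8 x i32>
//   %m  = mul <8 x i32> %xe, %ye
//   %r  = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
// is the kind of pattern that maps onto a VMLAV.u16 style reduction.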
1900 unsigned RevVTSize = ResVT.getSizeInBits();
1901 if (ValVT.getSizeInBits() <= 128 &&
1902 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1903 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1904 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1905 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1906 }
1907
1908 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1909 }
1910
1911 InstructionCost
1912 ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1913 FastMathFlags FMF,
1914 TTI::TargetCostKind CostKind) const {
1915 EVT ValVT = TLI->getValueType(DL, Ty);
1916
1917 // In general floating point reductions are a series of elementwise
1918 // operations, with free extracts on each step. These are either in-order or
1919 // treewise depending on whether that is allowed by the fast math flags.
1920 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1921 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1922 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1923 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1924 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1925 unsigned EltSize = ValVT.getScalarSizeInBits();
1926 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1927 InstructionCost VecCost;
1928 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1929 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1930 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1931 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1932 NumElts /= 2;
1933 }
1934
1935 // For fp16 we need to extract the upper lane elements. MVE can add a
1936 // VREV+FMIN/MAX to perform another vector step instead.
1937 InstructionCost ExtractCost = 0;
1938 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1939 NumElts == 8) {
1940 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1941 NumElts /= 2;
1942 } else if (ValVT.getVectorElementType() == MVT::f16)
1943 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1944
1945 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1946 {Ty->getElementType(), Ty->getElementType()},
1947 FMF);
1948 return VecCost + ExtractCost +
1949 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1950 }
1951
1952 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1953 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1954 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1955
1956 // All costs are the same for u/s min/max. These lower to vminv, which are
1957 // given a slightly higher cost as they tend to take multiple cycles for
1958 // smaller type sizes.
1959 static const CostTblEntry CostTblAdd[]{
1960 {ISD::SMIN, MVT::v16i8, 4},
1961 {ISD::SMIN, MVT::v8i16, 3},
1962 {ISD::SMIN, MVT::v4i32, 2},
1963 };
1964 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1965 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1966 }
1967
1968 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1969 }
1970
1971 InstructionCost
1972 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1973 TTI::TargetCostKind CostKind) const {
1974 unsigned Opc = ICA.getID();
1975 switch (Opc) {
1976 case Intrinsic::get_active_lane_mask:
1977 // Currently we make a somewhat optimistic assumption that
1978 // active_lane_mask intrinsics are always free. In reality one may be freely
1979 // folded into a tail-predicated loop, expanded into a VCTP or expanded into
1980 // a lot of add/icmp code. We may need to improve this in the future, but being
1981 // able to detect if it is free or not involves looking at a lot of other
1982 // code. We currently assume that the vectorizer inserted these, and knew
1983 // what it was doing in adding one.
1984 if (ST->hasMVEIntegerOps())
1985 return 0;
1986 break;
1987 case Intrinsic::sadd_sat:
1988 case Intrinsic::ssub_sat:
1989 case Intrinsic::uadd_sat:
1990 case Intrinsic::usub_sat: {
1991 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1992 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1993 Type *RetTy = ICA.getReturnType();
1994
1995 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1996 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1997 return 1; // qadd / qsub
1998 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
1999 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2000 // Otherwise return the cost of expanding the node. Generally an add +
2001 // icmp + sel.
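// For example (illustrative), a scalar i64 sadd.sat is expected to expand to
// an add plus compares and selects clamping to INT64_MAX / INT64_MIN, hence
// the add + 2*icmp + 2*select costing below.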
2002 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2003 Type *CondTy = RetTy->getWithNewBitWidth(1);
2004 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2005 RetTy, CostKind) +
2006 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2007 CostKind) +
2008 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2009 CostKind);
2010 }
2011
2012 if (!ST->hasMVEIntegerOps())
2013 break;
2014
2015 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2016 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2017 LT.second == MVT::v16i8) {
2018 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
2019 // need to extend the type, as it uses shr(qadd(shl, shl)).
2020 unsigned Instrs =
2021 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2022 : 4;
2023 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2024 }
2025 break;
2026 }
2027 case Intrinsic::abs:
2028 case Intrinsic::smin:
2029 case Intrinsic::smax:
2030 case Intrinsic::umin:
2031 case Intrinsic::umax: {
2032 if (!ST->hasMVEIntegerOps())
2033 break;
2034 Type *VT = ICA.getReturnType();
2035
2036 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2037 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2038 LT.second == MVT::v16i8)
2039 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2040 break;
2041 }
2042 case Intrinsic::minnum:
2043 case Intrinsic::maxnum: {
2044 if (!ST->hasMVEFloatOps())
2045 break;
2046 Type *VT = ICA.getReturnType();
2047 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2048 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2049 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2050 break;
2051 }
2052 case Intrinsic::fptosi_sat:
2053 case Intrinsic::fptoui_sat: {
2054 if (ICA.getArgTypes().empty())
2055 break;
2056 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2057 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2058 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2059 // Check for the legal types, with the correct subtarget features.
2060 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2061 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2062 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2063 return LT.first;
2064
2065 // Equally for MVE vector types
2066 if (ST->hasMVEFloatOps() &&
2067 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2068 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2069 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2070
2071 // If we can, use a legal convert followed by a min+max.
2072 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2073 (ST->hasFP64() && LT.second == MVT::f64) ||
2074 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2075 (ST->hasMVEFloatOps() &&
2076 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2077 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2078 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2079 LT.second.getScalarSizeInBits());
2080 InstructionCost Cost =
2081 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2082 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2083 : Intrinsic::umin,
2084 LegalTy, {LegalTy, LegalTy});
2085 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2086 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2087 : Intrinsic::umax,
2088 LegalTy, {LegalTy, LegalTy});
2089 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2090 return LT.first * Cost;
2091 }
2092 // Otherwise we need to follow the default expansion that clamps the value
2093 // using a float min/max with a fcmp+sel for nan handling when signed.
2094 Type *FPTy = ICA.getArgTypes()[0];
2095 Type *RetTy = ICA.getReturnType();
2096 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2097 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2098 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2099 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2100 Cost +=
2101 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2102 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2103 if (IsSigned) {
2104 Type *CondTy = RetTy->getWithNewBitWidth(1);
2105 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2106 CmpInst::FCMP_UNO, CostKind);
2107 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2108 CmpInst::FCMP_UNO, CostKind);
2109 }
2110 return Cost;
2111 }
2112 }
2113
2114 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2115 }
2116
2117 bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2118 if (!F->isIntrinsic())
2119 return BaseT::isLoweredToCall(F);
2120
2121 // Assume all Arm-specific intrinsics map to an instruction.
2122 if (F->getName().starts_with("llvm.arm"))
2123 return false;
2124
2125 switch (F->getIntrinsicID()) {
2126 default: break;
2127 case Intrinsic::powi:
2128 case Intrinsic::sin:
2129 case Intrinsic::cos:
2130 case Intrinsic::sincos:
2131 case Intrinsic::pow:
2132 case Intrinsic::log:
2133 case Intrinsic::log10:
2134 case Intrinsic::log2:
2135 case Intrinsic::exp:
2136 case Intrinsic::exp2:
2137 return true;
2138 case Intrinsic::sqrt:
2139 case Intrinsic::fabs:
2140 case Intrinsic::copysign:
2141 case Intrinsic::floor:
2142 case Intrinsic::ceil:
2143 case Intrinsic::trunc:
2144 case Intrinsic::rint:
2145 case Intrinsic::nearbyint:
2146 case Intrinsic::round:
2147 case Intrinsic::canonicalize:
2148 case Intrinsic::lround:
2149 case Intrinsic::llround:
2150 case Intrinsic::lrint:
2151 case Intrinsic::llrint:
2152 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2153 return true;
2154 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2155 return true;
2156 // Some operations can be handled by vector instructions and assume
2157 // unsupported vectors will be expanded into supported scalar ones.
2158 // TODO Handle scalar operations properly.
2159 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2160 case Intrinsic::masked_store:
2161 case Intrinsic::masked_load:
2162 case Intrinsic::masked_gather:
2163 case Intrinsic::masked_scatter:
2164 return !ST->hasMVEIntegerOps();
2165 case Intrinsic::sadd_with_overflow:
2166 case Intrinsic::uadd_with_overflow:
2167 case Intrinsic::ssub_with_overflow:
2168 case Intrinsic::usub_with_overflow:
2169 case Intrinsic::sadd_sat:
2170 case Intrinsic::uadd_sat:
2171 case Intrinsic::ssub_sat:
2172 case Intrinsic::usub_sat:
2173 return false;
2174 }
2175
2176 return BaseT::isLoweredToCall(F);
2177 }
2178
2179 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2180 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2181 EVT VT = TLI->getValueType(DL, I.getType(), true);
2182 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2183 return true;
2184
2185 // Check if an intrinsic will be lowered to a call and assume that any
2186 // other CallInst will generate a bl.
2187 if (auto *Call = dyn_cast<CallInst>(&I)) {
2188 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2189 switch(II->getIntrinsicID()) {
2190 case Intrinsic::memcpy:
2191 case Intrinsic::memset:
2192 case Intrinsic::memmove:
2193 return getNumMemOps(II) == -1;
2194 default:
2195 if (const Function *F = Call->getCalledFunction())
2196 return isLoweredToCall(F);
2197 }
2198 }
2199 return true;
2200 }
2201
2202 // FPv5 provides conversions between integer, double-precision,
2203 // single-precision, and half-precision formats.
2204 switch (I.getOpcode()) {
2205 default:
2206 break;
2207 case Instruction::FPToSI:
2208 case Instruction::FPToUI:
2209 case Instruction::SIToFP:
2210 case Instruction::UIToFP:
2211 case Instruction::FPTrunc:
2212 case Instruction::FPExt:
2213 return !ST->hasFPARMv8Base();
2214 }
2215
2216 // FIXME: Unfortunately the approach of checking the Operation Action does
2217 // not catch all cases of Legalization that use library calls. Our
2218 // Legalization step categorizes some transformations into library calls as
2219 // Custom, Expand or even Legal when doing type legalization. So for now
2220 // we have to special case for instance the SDIV of 64bit integers and the
2221 // use of floating point emulation.
2222 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2223 switch (ISD) {
2224 default:
2225 break;
2226 case ISD::SDIV:
2227 case ISD::UDIV:
2228 case ISD::SREM:
2229 case ISD::UREM:
2230 case ISD::SDIVREM:
2231 case ISD::UDIVREM:
2232 return true;
2233 }
2234 }
2235
2236 // Assume all other non-float operations are supported.
2237 if (!VT.isFloatingPoint())
2238 return false;
2239
2240 // We'll need a library call to handle most floats when using soft float.
2241 if (TLI->useSoftFloat()) {
2242 switch (I.getOpcode()) {
2243 default:
2244 return true;
2245 case Instruction::Alloca:
2246 case Instruction::Load:
2247 case Instruction::Store:
2248 case Instruction::Select:
2249 case Instruction::PHI:
2250 return false;
2251 }
2252 }
2253
2254 // We'll need a libcall to perform double precision operations on a single
2255 // precision only FPU.
2256 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2257 return true;
2258
2259 // Likewise for half precision arithmetic.
2260 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2261 return true;
2262
2263 return false;
2264 }
2265
2266 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2267 AssumptionCache &AC,
2268 TargetLibraryInfo *LibInfo,
2269 HardwareLoopInfo &HWLoopInfo) const {
2270 // Low-overhead branches are only supported in the 'low-overhead branch'
2271 // extension of v8.1-m.
2272 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2273 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2274 return false;
2275 }
2276
2277 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2278 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2279 return false;
2280 }
2281
2282 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2283 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2284 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2285 return false;
2286 }
2287
2288 const SCEV *TripCountSCEV =
2289 SE.getAddExpr(BackedgeTakenCount,
2290 SE.getOne(BackedgeTakenCount->getType()));
2291
2292 // We need to store the trip count in LR, a 32-bit register.
2293 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2294 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2295 return false;
2296 }
2297
2298 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2299 // point in generating a hardware loop if that's going to happen.
2300
2301 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2302 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2303 switch (Call->getIntrinsicID()) {
2304 default:
2305 break;
2306 case Intrinsic::start_loop_iterations:
2307 case Intrinsic::test_start_loop_iterations:
2308 case Intrinsic::loop_decrement:
2309 case Intrinsic::loop_decrement_reg:
2310 return true;
2311 }
2312 }
2313 return false;
2314 };
2315
2316 // Scan the instructions to see if there's any that we know will turn into a
2317 // call or if this loop is already a low-overhead loop or will become a tail
2318 // predicated loop.
2319 bool IsTailPredLoop = false;
2320 auto ScanLoop = [&](Loop *L) {
2321 for (auto *BB : L->getBlocks()) {
2322 for (auto &I : *BB) {
2323 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2324 isa<InlineAsm>(I)) {
2325 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2326 return false;
2327 }
2328 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2329 IsTailPredLoop |=
2330 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2331 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2332 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2333 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2334 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2335 }
2336 }
2337 return true;
2338 };
2339
2340 // Visit inner loops.
2341 for (auto *Inner : *L)
2342 if (!ScanLoop(Inner))
2343 return false;
2344
2345 if (!ScanLoop(L))
2346 return false;
2347
2348 // TODO: Check whether the trip count calculation is expensive. If L is the
2349 // inner loop but we know it has a low trip count, calculating that trip
2350 // count (in the parent loop) may be detrimental.
2351
2352 LLVMContext &C = L->getHeader()->getContext();
2353 HWLoopInfo.CounterInReg = true;
2354 HWLoopInfo.IsNestingLegal = false;
2355 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2356 HWLoopInfo.CountType = Type::getInt32Ty(C);
2357 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2358 return true;
2359 }
2360
2361 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2362 // We don't allow icmps, and because we only look at single block loops,
2363 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2364 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2365 return false;
2366 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2367 // not currently canonical, but soon will be. Code without them uses icmp, and
2368 // so is not tail predicated as per the condition above. In order to get the
2369 // same performance we treat min and max the same as an icmp for tailpred
2370 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2371 // pick more optimal instructions like VQDMULH. They need to be recognized
2372 // directly by the vectorizer).
2373 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2374 if ((II->getIntrinsicID() == Intrinsic::smin ||
2375 II->getIntrinsicID() == Intrinsic::smax ||
2376 II->getIntrinsicID() == Intrinsic::umin ||
2377 II->getIntrinsicID() == Intrinsic::umax) &&
2378 ++ICmpCount > 1)
2379 return false;
2380
2381 if (isa<FCmpInst>(&I))
2382 return false;
2383
2384 // We could allow extending/narrowing FP loads/stores, but codegen is
2385 // too inefficient so reject this for now.
2386 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2387 return false;
2388
2389 // Extends have to be extending-loads
2390 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2391 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2392 return false;
2393
2394 // Truncs have to be narrowing-stores
2395 if (isa<TruncInst>(&I) )
2396 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2397 return false;
2398
2399 return true;
2400 }
2401
2402 // To set up a tail-predicated loop, we need to know the total number of
2403 // elements processed by that loop. Thus, we need to determine the element
2404 // size and:
2405 // 1) it should be uniform for all operations in the vector loop, so we
2406 // e.g. don't want any widening/narrowing operations.
2407 // 2) it should be smaller than i64s because we don't have vector operations
2408 // that work on i64s.
2409 // 3) we don't want elements to be reversed or shuffled, to make sure the
2410 // tail-predication masks/predicates the right lanes.
2411 //
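// For example (illustrative), a loop whose body computes on i64 values fails
// requirement 2) and will be rejected by canTailPredicateLoop below.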
2412 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2413 const DataLayout &DL,
2414 const LoopAccessInfo *LAI) {
2415 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2416
2417 // If there are live-out values, it is probably a reduction. We can predicate
2418 // most reduction operations freely under MVE using a combination of
2419 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2420 // floating point and integer reductions, but don't check for operators
2421 // specifically here. If the value ends up not being a reduction (and so the
2422 // vectorizer cannot tailfold the loop), we should fall back to standard
2423 // vectorization automatically.
2424 SmallVector<Instruction *, 8> LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2426 bool ReductionsDisabled =
2427 EnableTailPredication == TailPredication::EnabledNoReductions ||
2428 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2429
2430 for (auto *I : LiveOuts) {
2431 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2432 !I->getType()->isHalfTy()) {
2433 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2434 "live-out value\n");
2435 return false;
2436 }
2437 if (ReductionsDisabled) {
2438 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2439 return false;
2440 }
2441 }
2442
2443 // Next, check that all instructions can be tail-predicated.
2444 PredicatedScalarEvolution PSE = LAI->getPSE();
2445 int ICmpCount = 0;
2446
2447 for (BasicBlock *BB : L->blocks()) {
2448 for (Instruction &I : BB->instructionsWithoutDebug()) {
2449 if (isa<PHINode>(&I))
2450 continue;
2451 if (!canTailPredicateInstruction(I, ICmpCount)) {
2452 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2453 return false;
2454 }
2455
2456 Type *T = I.getType();
2457 if (T->getScalarSizeInBits() > 32) {
2458 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2459 return false;
2460 }
2461 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2462 Value *Ptr = getLoadStorePointerOperand(&I);
2463 Type *AccessTy = getLoadStoreType(&I);
2464 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2465 if (NextStride == 1) {
2466 // TODO: for now only allow consecutive strides of 1. We could support
2467 // other strides as long as they are uniform, but let's keep it simple
2468 // for now.
2469 continue;
2470 } else if (NextStride == -1 ||
2471 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2472 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2473 LLVM_DEBUG(dbgs()
2474 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2475 "be tail-predicated\n.");
2476 return false;
2477 // TODO: don't tail predicate if there is a reversed load?
2478 } else if (EnableMaskedGatherScatters) {
2479 // Gather/scatters do allow loading from arbitrary strides, at
2480 // least if they are loop invariant.
2481 // TODO: Loop variant strides should in theory work, too, but
2482 // this requires further testing.
2483 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2484 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2485 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2486 if (PSE.getSE()->isLoopInvariant(Step, L))
2487 continue;
2488 }
2489 }
2490 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2491 "tail-predicate\n.");
2492 return false;
2493 }
2494 }
2495 }
2496
2497 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2498 return true;
2499 }
2500
2501 bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2502 if (!EnableTailPredication) {
2503 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2504 return false;
2505 }
2506
2507 // Creating a predicated vector loop is the first step for generating a
2508 // tail-predicated hardware loop, for which we need the MVE masked
2509 // load/stores instructions:
2510 if (!ST->hasMVEIntegerOps())
2511 return false;
2512
2513 LoopVectorizationLegality *LVL = TFI->LVL;
2514 Loop *L = LVL->getLoop();
2515
2516 // For now, restrict this to single block loops.
2517 if (L->getNumBlocks() > 1) {
2518 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2519 "loop.\n");
2520 return false;
2521 }
2522
2523 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2524
2525 LoopInfo *LI = LVL->getLoopInfo();
2526 HardwareLoopInfo HWLoopInfo(L);
2527 if (!HWLoopInfo.canAnalyze(*LI)) {
2528 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2529 "analyzable.\n");
2530 return false;
2531 }
2532
2533 AssumptionCache *AC = LVL->getAssumptionCache();
2534 ScalarEvolution *SE = LVL->getScalarEvolution();
2535
2536 // This checks if we have the low-overhead branch architecture
2537 // extension, and if we will create a hardware-loop:
2538 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2539 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2540 "profitable.\n");
2541 return false;
2542 }
2543
2544 DominatorTree *DT = LVL->getDominatorTree();
2545 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2546 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2547 "a candidate.\n");
2548 return false;
2549 }
2550
2551 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2552 }
2553
2554 TailFoldingStyle
2555 ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2556 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2557 return TailFoldingStyle::DataWithoutLaneMask;
2558
2559 // Intrinsic @llvm.get.active.lane.mask is supported.
2560 // It is used in the MVETailPredication pass, which requires the number of
2561 // elements processed by this vector loop to setup the tail-predicated
2562 // loop.
2563 return TailFoldingStyle::Data;
2564 }
2565 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2566 TTI::UnrollingPreferences &UP,
2567 OptimizationRemarkEmitter *ORE) const {
2568 // Enable upper-bound unrolling universally, provided that we do not see an
2569 // active lane mask, which will be better kept as a loop to become tail
2570 // predicated than to be conditionally unrolled.
2571 UP.UpperBound =
2572 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2573 return isa<IntrinsicInst>(I) &&
2574 cast<IntrinsicInst>(I).getIntrinsicID() ==
2575 Intrinsic::get_active_lane_mask;
2576 });
2577
2578 // Only currently enable these preferences for M-Class cores.
2579 if (!ST->isMClass())
2580 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2581
2582 // Disable loop unrolling for Oz and Os.
2583 UP.OptSizeThreshold = 0;
2584 UP.PartialOptSizeThreshold = 0;
2585 if (L->getHeader()->getParent()->hasOptSize())
2586 return;
2587
2588 SmallVector<BasicBlock*, 4> ExitingBlocks;
2589 L->getExitingBlocks(ExitingBlocks);
2590 LLVM_DEBUG(dbgs() << "Loop has:\n"
2591 << "Blocks: " << L->getNumBlocks() << "\n"
2592 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2593
2594 // Only allow one exit other than the latch. This acts as an early exit
2595 // as it mirrors the profitability calculation of the runtime unroller.
2596 if (ExitingBlocks.size() > 2)
2597 return;
2598
2599 // Limit the CFG of the loop body for targets with a branch predictor.
2600 // Allowing 4 blocks permits if-then-else diamonds in the body.
2601 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2602 return;
2603
2604 // Don't unroll vectorized loops, including the remainder loop
2605 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2606 return;
2607
2608 // Scan the loop: don't unroll loops with calls as this could prevent
2609 // inlining.
2610 InstructionCost Cost = 0;
2611 for (auto *BB : L->getBlocks()) {
2612 for (auto &I : *BB) {
2613 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2614 // scalar code.
2615 if (I.getType()->isVectorTy())
2616 return;
2617
2618 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2619 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2620 if (!isLoweredToCall(F))
2621 continue;
2622 }
2623 return;
2624 }
2625
2626 SmallVector<const Value*, 4> Operands(I.operand_values());
2627 Cost += getInstructionCost(&I, Operands,
2628 TargetTransformInfo::TCK_SizeAndLatency);
2629 }
2630 }
2631
2632 // On v6m cores, there are very few registers available. We can easily end up
2633 // spilling and reloading more registers in an unrolled loop. Look at the
2634 // number of LCSSA phis as a rough measure of how many registers will need to
2635 // be live out of the loop, reducing the default unroll count if more than 1
2636 // value is needed. In the long run, all of this should be learnt by a
2637 // machine.
2638 unsigned UnrollCount = 4;
2639 if (ST->isThumb1Only()) {
2640 unsigned ExitingValues = 0;
2641 SmallVector<BasicBlock *, 4> ExitBlocks;
2642 L->getExitBlocks(ExitBlocks);
2643 for (auto *Exit : ExitBlocks) {
2644 // Count the number of LCSSA phis. Exclude values coming from GEPs as
2645 // only the last is expected to be needed for address operands.
2646 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2647 return PH.getNumOperands() != 1 ||
2648 !isa<GetElementPtrInst>(PH.getOperand(0));
2649 });
2650 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2651 }
2652 if (ExitingValues)
2653 UnrollCount /= ExitingValues;
2654 if (UnrollCount <= 1)
2655 return;
2656 }
2657
2658 // For processors with low overhead branching (LOB), runtime unrolling the
2659 // innermost loop is often detrimental to performance. In these cases the loop
2660 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2661 // deeply nested loops get executed multiple times, negating the benefits of
2662 // LOB. This is particularly noticeable when the loop trip count of the
2663 // innermost loop varies within the outer loop, such as in the case of
2664 // triangular matrix decompositions. In these cases we will prefer to not
2665 // unroll the innermost loop, with the intention for it to be executed as a
2666 // low overhead loop.
2667 bool Runtime = true;
2668 if (ST->hasLOB()) {
2669 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2670 const auto *BETC = SE.getBackedgeTakenCount(L);
2671 auto *Outer = L->getOutermostLoop();
2672 if ((L != Outer && Outer != L->getParentLoop()) ||
2673 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2674 Runtime = false;
2675 }
2676 }
2677 }
2678
2679 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2680 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2681
2682 UP.Partial = true;
2683 UP.Runtime = Runtime;
2684 UP.UnrollRemainder = true;
2685 UP.DefaultUnrollRuntimeCount = UnrollCount;
2686 UP.UnrollAndJam = true;
2687 UP.UnrollAndJamInnerLoopThreshold = 60;
2688
2689 // Forcing unrolling of small loops can be very useful because of the
2690 // branch-taken cost of the backedge.
2691 if (Cost < 12)
2692 UP.Force = true;
2693 }
2694
2695 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2696 TTI::PeelingPreferences &PP) const {
2697 BaseT::getPeelingPreferences(L, SE, PP);
2698 }
2699
2700 bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2701 if (!ST->hasMVEIntegerOps())
2702 return false;
2703
2704 unsigned ScalarBits = Ty->getScalarSizeInBits();
2705 switch (Kind) {
2706 case RecurKind::Add:
2707 return ScalarBits <= 64;
2708 default:
2709 return false;
2710 }
2711 }
2712
2713 bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2714 if (!ST->hasMVEIntegerOps())
2715 return false;
2716 return true;
2717 }
2718
2719 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2720 StackOffset BaseOffset,
2721 bool HasBaseReg, int64_t Scale,
2722 unsigned AddrSpace) const {
2723 TargetLoweringBase::AddrMode AM;
2724 AM.BaseGV = BaseGV;
2725 AM.BaseOffs = BaseOffset.getFixed();
2726 AM.HasBaseReg = HasBaseReg;
2727 AM.Scale = Scale;
2728 AM.ScalableOffset = BaseOffset.getScalable();
2729 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2730 if (ST->hasFPAO())
2731 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2732 return 0;
2733 }
2734 return InstructionCost::getInvalid();
2735 }
2736
2737 bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2738 if (Thumb) {
2739 // B.W is available in any Thumb2-supporting target, and also in every
2740 // version of Armv8-M, even Baseline which does not include the rest of
2741 // Thumb2.
2742 return ST->isThumb2() || ST->hasV8MBaselineOps();
2743 } else {
2744 // B is available in all versions of the Arm ISA, so the only question is
2745 // whether that ISA is available at all.
2746 return ST->hasARMOps();
2747 }
2748 }
2749
2750 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2751 /// of the vector elements.
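/// For example (illustrative IR only), operands such as
///   %e1 = sext <8 x i8> %a to <8 x i16>
///   %e2 = sext <8 x i8> %b to <8 x i16>
/// satisfy this check and can be folded into a vaddl/vsubl style instruction.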
2752 static bool areExtractExts(Value *Ext1, Value *Ext2) {
2753 using namespace PatternMatch;
2754
2755 auto areExtDoubled = [](Instruction *Ext) {
2756 return Ext->getType()->getScalarSizeInBits() ==
2757 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2758 };
2759
2760 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2761 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2762 !areExtDoubled(cast<Instruction>(Ext1)) ||
2763 !areExtDoubled(cast<Instruction>(Ext2)))
2764 return false;
2765
2766 return true;
2767 }
2768
2769 /// Check if sinking \p I's operands to I's basic block is profitable, because
2770 /// the operands can be folded into a target instruction, e.g.
2771 /// sext/zext can be folded into vsubl.
2772 bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2773 SmallVectorImpl<Use *> &Ops) const {
2774 using namespace PatternMatch;
2775
2776 if (!I->getType()->isVectorTy())
2777 return false;
2778
2779 if (ST->hasNEON()) {
2780 switch (I->getOpcode()) {
2781 case Instruction::Sub:
2782 case Instruction::Add: {
2783 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2784 return false;
2785 Ops.push_back(&I->getOperandUse(0));
2786 Ops.push_back(&I->getOperandUse(1));
2787 return true;
2788 }
2789 default:
2790 return false;
2791 }
2792 }
2793
2794 if (!ST->hasMVEIntegerOps())
2795 return false;
2796
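  // Returns true if I is an FMul whose only use is the subtracted operand of
  // an FSub, i.e. the pair will become a fused multiply-subtract, for which
  // sinking a splat multiplicand is not expected to help.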
  auto IsFMSMul = [&](Instruction *I) {
    if (!I->hasOneUse())
      return false;
    auto *Sub = cast<Instruction>(*I->users().begin());
    return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
  };
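  // Returns true if the fma intrinsic I negates one of its multiplicands and
  // will therefore act as a fused multiply-subtract.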
  auto IsFMS = [&](Instruction *I) {
    if (match(I->getOperand(0), m_FNeg(m_Value())) ||
        match(I->getOperand(1), m_FNeg(m_Value())))
      return true;
    return false;
  };

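  // Can instruction I make use of a splat sunk into operand index Operand?
  // For the non-commutative cases only the second operand can take the splat.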
  auto IsSinker = [&](Instruction *I, int Operand) {
    switch (I->getOpcode()) {
    case Instruction::Add:
    case Instruction::Mul:
    case Instruction::FAdd:
    case Instruction::ICmp:
    case Instruction::FCmp:
      return true;
    case Instruction::FMul:
      return !IsFMSMul(I);
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
      return Operand == 1;
    case Instruction::Call:
      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::fma:
          return !IsFMS(I);
        case Intrinsic::sadd_sat:
        case Intrinsic::uadd_sat:
        case Intrinsic::arm_mve_add_predicated:
        case Intrinsic::arm_mve_mul_predicated:
        case Intrinsic::arm_mve_qadd_predicated:
        case Intrinsic::arm_mve_vhadd:
        case Intrinsic::arm_mve_hadd_predicated:
        case Intrinsic::arm_mve_vqdmull:
        case Intrinsic::arm_mve_vqdmull_predicated:
        case Intrinsic::arm_mve_vqdmulh:
        case Intrinsic::arm_mve_qdmulh_predicated:
        case Intrinsic::arm_mve_vqrdmulh:
        case Intrinsic::arm_mve_qrdmulh_predicated:
        case Intrinsic::arm_mve_fma_predicated:
          return true;
        case Intrinsic::ssub_sat:
        case Intrinsic::usub_sat:
        case Intrinsic::arm_mve_sub_predicated:
        case Intrinsic::arm_mve_qsub_predicated:
        case Intrinsic::arm_mve_hsub_predicated:
        case Intrinsic::arm_mve_vhsub:
          return Operand == 1;
        default:
          return false;
        }
      }
      return false;
    default:
      return false;
    }
  };

  for (auto OpIdx : enumerate(I->operands())) {
    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand.
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    Instruction *Shuffle = Op;
    if (Shuffle->getOpcode() == Instruction::BitCast)
      Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
    // We are looking for a splat that can be sunk.
    if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
                                                          m_ZeroInt()),
                                              m_Undef(), m_ZeroMask())))
      continue;
    if (!IsSinker(I, OpIdx.index()))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across
    // GPR and vector registers.
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!IsSinker(Insn, U.getOperandNo()))
        return false;
    }

    Ops.push_back(&Shuffle->getOperandUse(0));
    if (Shuffle != Op)
      Ops.push_back(&Op->getOperandUse(0));
    Ops.push_back(&OpIdx.value());
  }
  return true;
}

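/// Return how many bytes of padding would widen a global integer array of
/// \p Size bytes to the next 4-byte boundary, or 0 if padding is disabled,
/// unnecessary, or would push the array past the inline memcpy threshold.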
unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
                                                 Type *ArrayType) const {
  if (!UseWidenGlobalArrays) {
    LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
    return 0;
  }

  // Don't modify non-integer array types.
  if (!ArrayType || !ArrayType->isArrayTy() ||
      !ArrayType->getArrayElementType()->isIntegerTy())
    return 0;

  // We pad to 4-byte boundaries.
  if (Size % 4 == 0)
    return 0;

  unsigned NumBytesToPad = 4 - (Size % 4);
  unsigned NewSize = Size + NumBytesToPad;

  // Max number of bytes that memcpy allows for lowering to load/stores before
  // it falls back to a library call (__aeabi_memcpy).
  unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();

  if (NewSize > MaxMemIntrinsicSize)
    return 0;

  return NumBytesToPad;
}