1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/CodeGenTypes/MachineValueType.h"
19 #include "llvm/IR/BasicBlock.h"
20 #include "llvm/IR/DataLayout.h"
21 #include "llvm/IR/DerivedTypes.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/IntrinsicsARM.h"
27 #include "llvm/IR/PatternMatch.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
31 #include "llvm/Target/TargetMachine.h"
32 #include "llvm/TargetParser/SubtargetFeature.h"
33 #include "llvm/Transforms/InstCombine/InstCombiner.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
37 #include <algorithm>
38 #include <cassert>
39 #include <cstdint>
40 #include <optional>
41 #include <utility>
42
43 using namespace llvm;
44
45 #define DEBUG_TYPE "armtti"
46
47 static cl::opt<bool> EnableMaskedLoadStores(
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
51 static cl::opt<bool> DisableLowOverheadLoops(
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55 static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
59 extern cl::opt<TailPredication::Mode> EnableTailPredication;
60
61 extern cl::opt<bool> EnableMaskedGatherScatters;
62
63 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
64
65 /// Convert a vector load intrinsic into a simple llvm load instruction.
66 /// This is beneficial when the underlying object being addressed comes
67 /// from a constant, since we get constant-folding for free.
simplifyNeonVld1(const IntrinsicInst & II,unsigned MemAlign,InstCombiner::BuilderTy & Builder)68 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
69 InstCombiner::BuilderTy &Builder) {
70 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
71
72 if (!IntrAlign)
73 return nullptr;
74
75 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
76 ? MemAlign
77 : IntrAlign->getLimitedValue();
78
79 if (!isPowerOf2_32(Alignment))
80 return nullptr;
81
82 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
83 PointerType::get(II.getType(), 0));
84 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
85 }
86
areInlineCompatible(const Function * Caller,const Function * Callee) const87 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
88 const Function *Callee) const {
89 const TargetMachine &TM = getTLI()->getTargetMachine();
90 const FeatureBitset &CallerBits =
91 TM.getSubtargetImpl(*Caller)->getFeatureBits();
92 const FeatureBitset &CalleeBits =
93 TM.getSubtargetImpl(*Callee)->getFeatureBits();
94
95 // To inline a callee, all features not in the allowed list must match exactly.
96 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
97 (CalleeBits & ~InlineFeaturesAllowed);
98 // For features in the allowed list, the callee's features must be a subset of
99 // the callers'.
100 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
101 (CalleeBits & InlineFeaturesAllowed);
102 return MatchExact && MatchSubset;
103 }
104
105 TTI::AddressingModeKind
getPreferredAddressingMode(const Loop * L,ScalarEvolution * SE) const106 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
107 ScalarEvolution *SE) const {
108 if (ST->hasMVEIntegerOps())
109 return TTI::AMK_PostIndexed;
110
111 if (L->getHeader()->getParent()->hasOptSize())
112 return TTI::AMK_None;
113
114 if (ST->isMClass() && ST->isThumb2() &&
115 L->getNumBlocks() == 1)
116 return TTI::AMK_PreIndexed;
117
118 return TTI::AMK_None;
119 }
120
121 std::optional<Instruction *>
instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II) const122 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
123 using namespace PatternMatch;
124 Intrinsic::ID IID = II.getIntrinsicID();
125 switch (IID) {
126 default:
127 break;
128 case Intrinsic::arm_neon_vld1: {
129 Align MemAlign =
130 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
131 &IC.getAssumptionCache(), &IC.getDominatorTree());
132 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
133 return IC.replaceInstUsesWith(II, V);
134 }
135 break;
136 }
137
138 case Intrinsic::arm_neon_vld2:
139 case Intrinsic::arm_neon_vld3:
140 case Intrinsic::arm_neon_vld4:
141 case Intrinsic::arm_neon_vld2lane:
142 case Intrinsic::arm_neon_vld3lane:
143 case Intrinsic::arm_neon_vld4lane:
144 case Intrinsic::arm_neon_vst1:
145 case Intrinsic::arm_neon_vst2:
146 case Intrinsic::arm_neon_vst3:
147 case Intrinsic::arm_neon_vst4:
148 case Intrinsic::arm_neon_vst2lane:
149 case Intrinsic::arm_neon_vst3lane:
150 case Intrinsic::arm_neon_vst4lane: {
151 Align MemAlign =
152 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
153 &IC.getAssumptionCache(), &IC.getDominatorTree());
154 unsigned AlignArg = II.arg_size() - 1;
155 Value *AlignArgOp = II.getArgOperand(AlignArg);
156 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
157 if (Align && *Align < MemAlign) {
158 return IC.replaceOperand(
159 II, AlignArg,
160 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
161 false));
162 }
163 break;
164 }
165
166 case Intrinsic::arm_mve_pred_i2v: {
167 Value *Arg = II.getArgOperand(0);
168 Value *ArgArg;
169 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
170 PatternMatch::m_Value(ArgArg))) &&
171 II.getType() == ArgArg->getType()) {
172 return IC.replaceInstUsesWith(II, ArgArg);
173 }
174 Constant *XorMask;
175 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
176 PatternMatch::m_Value(ArgArg)),
177 PatternMatch::m_Constant(XorMask))) &&
178 II.getType() == ArgArg->getType()) {
179 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
180 if (CI->getValue().trunc(16).isAllOnes()) {
181 auto TrueVector = IC.Builder.CreateVectorSplat(
182 cast<FixedVectorType>(II.getType())->getNumElements(),
183 IC.Builder.getTrue());
184 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
185 }
186 }
187 }
188 KnownBits ScalarKnown(32);
189 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
190 ScalarKnown)) {
191 return &II;
192 }
193 break;
194 }
195 case Intrinsic::arm_mve_pred_v2i: {
196 Value *Arg = II.getArgOperand(0);
197 Value *ArgArg;
198 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
199 PatternMatch::m_Value(ArgArg)))) {
200 return IC.replaceInstUsesWith(II, ArgArg);
201 }
202
203 if (II.getMetadata(LLVMContext::MD_range))
204 break;
205
206 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
207
208 if (auto CurrentRange = II.getRange()) {
209 Range = Range.intersectWith(*CurrentRange);
210 if (Range == CurrentRange)
211 break;
212 }
213
214 II.addRangeRetAttr(Range);
215 II.addRetAttr(Attribute::NoUndef);
216 return &II;
217 }
218 case Intrinsic::arm_mve_vadc:
219 case Intrinsic::arm_mve_vadc_predicated: {
220 unsigned CarryOp =
221 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
222 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
223 "Bad type for intrinsic!");
224
225 KnownBits CarryKnown(32);
226 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
227 CarryKnown)) {
228 return &II;
229 }
230 break;
231 }
232 case Intrinsic::arm_mve_vmldava: {
233 Instruction *I = cast<Instruction>(&II);
234 if (I->hasOneUse()) {
235 auto *User = cast<Instruction>(*I->user_begin());
236 Value *OpZ;
237 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
238 match(I->getOperand(3), m_Zero())) {
239 Value *OpX = I->getOperand(4);
240 Value *OpY = I->getOperand(5);
241 Type *OpTy = OpX->getType();
242
243 IC.Builder.SetInsertPoint(User);
244 Value *V =
245 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
246 {I->getOperand(0), I->getOperand(1),
247 I->getOperand(2), OpZ, OpX, OpY});
248
249 IC.replaceInstUsesWith(*User, V);
250 return IC.eraseInstFromFunction(*User);
251 }
252 }
253 return std::nullopt;
254 }
255 }
256 return std::nullopt;
257 }
258
simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt OrigDemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> SimplifyAndSetOp) const259 std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
260 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
261 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
262 std::function<void(Instruction *, unsigned, APInt, APInt &)>
263 SimplifyAndSetOp) const {
264
265 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
266 // opcode specifying a Top/Bottom instruction, which can change between
267 // instructions.
268 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
269 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
270 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
271
272 // The only odd/even lanes of operand 0 will only be demanded depending
273 // on whether this is a top/bottom instruction.
274 APInt DemandedElts =
275 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
276 : APInt::getHighBitsSet(2, 1));
277 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
278 // The other lanes will be defined from the inserted elements.
279 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
280 : APInt::getHighBitsSet(2, 1));
281 return std::nullopt;
282 };
283
284 switch (II.getIntrinsicID()) {
285 default:
286 break;
287 case Intrinsic::arm_mve_vcvt_narrow:
288 SimplifyNarrowInstrTopBottom(2);
289 break;
290 case Intrinsic::arm_mve_vqmovn:
291 SimplifyNarrowInstrTopBottom(4);
292 break;
293 case Intrinsic::arm_mve_vshrn:
294 SimplifyNarrowInstrTopBottom(7);
295 break;
296 }
297
298 return std::nullopt;
299 }
300
getIntImmCost(const APInt & Imm,Type * Ty,TTI::TargetCostKind CostKind)301 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
302 TTI::TargetCostKind CostKind) {
303 assert(Ty->isIntegerTy());
304
305 unsigned Bits = Ty->getPrimitiveSizeInBits();
306 if (Bits == 0 || Imm.getActiveBits() >= 64)
307 return 4;
308
309 int64_t SImmVal = Imm.getSExtValue();
310 uint64_t ZImmVal = Imm.getZExtValue();
311 if (!ST->isThumb()) {
312 if ((SImmVal >= 0 && SImmVal < 65536) ||
313 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
314 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
315 return 1;
316 return ST->hasV6T2Ops() ? 2 : 3;
317 }
318 if (ST->isThumb2()) {
319 if ((SImmVal >= 0 && SImmVal < 65536) ||
320 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
321 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
322 return 1;
323 return ST->hasV6T2Ops() ? 2 : 3;
324 }
325 // Thumb1, any i8 imm cost 1.
326 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
327 return 1;
328 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
329 return 2;
330 // Load from constantpool.
331 return 3;
332 }
333
334 // Constants smaller than 256 fit in the immediate field of
335 // Thumb1 instructions so we return a zero cost and 1 otherwise.
getIntImmCodeSizeCost(unsigned Opcode,unsigned Idx,const APInt & Imm,Type * Ty)336 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
337 const APInt &Imm, Type *Ty) {
338 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
339 return 0;
340
341 return 1;
342 }
343
344 // Checks whether Inst is part of a min(max()) or max(min()) pattern
345 // that will match to an SSAT instruction. Returns the instruction being
346 // saturated, or null if no saturation pattern was found.
isSSATMinMaxPattern(Instruction * Inst,const APInt & Imm)347 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
348 Value *LHS, *RHS;
349 ConstantInt *C;
350 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
351
352 if (InstSPF == SPF_SMAX &&
353 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
354 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
355
356 auto isSSatMin = [&](Value *MinInst) {
357 if (isa<SelectInst>(MinInst)) {
358 Value *MinLHS, *MinRHS;
359 ConstantInt *MinC;
360 SelectPatternFlavor MinSPF =
361 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
362 if (MinSPF == SPF_SMIN &&
363 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
364 MinC->getValue() == ((-Imm) - 1))
365 return true;
366 }
367 return false;
368 };
369
370 if (isSSatMin(Inst->getOperand(1)))
371 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
372 if (Inst->hasNUses(2) &&
373 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
374 return Inst->getOperand(1);
375 }
376 return nullptr;
377 }
378
379 // Look for a FP Saturation pattern, where the instruction can be simplified to
380 // a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
isFPSatMinMaxPattern(Instruction * Inst,const APInt & Imm)381 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
382 if (Imm.getBitWidth() != 64 ||
383 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
384 return false;
385 Value *FP = isSSATMinMaxPattern(Inst, Imm);
386 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
387 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
388 if (!FP)
389 return false;
390 return isa<FPToSIInst>(FP);
391 }
392
getIntImmCostInst(unsigned Opcode,unsigned Idx,const APInt & Imm,Type * Ty,TTI::TargetCostKind CostKind,Instruction * Inst)393 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
394 const APInt &Imm, Type *Ty,
395 TTI::TargetCostKind CostKind,
396 Instruction *Inst) {
397 // Division by a constant can be turned into multiplication, but only if we
398 // know it's constant. So it's not so much that the immediate is cheap (it's
399 // not), but that the alternative is worse.
400 // FIXME: this is probably unneeded with GlobalISel.
401 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
402 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
403 Idx == 1)
404 return 0;
405
406 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
407 // splitting any large offsets.
408 if (Opcode == Instruction::GetElementPtr && Idx != 0)
409 return 0;
410
411 if (Opcode == Instruction::And) {
412 // UXTB/UXTH
413 if (Imm == 255 || Imm == 65535)
414 return 0;
415 // Conversion to BIC is free, and means we can use ~Imm instead.
416 return std::min(getIntImmCost(Imm, Ty, CostKind),
417 getIntImmCost(~Imm, Ty, CostKind));
418 }
419
420 if (Opcode == Instruction::Add)
421 // Conversion to SUB is free, and means we can use -Imm instead.
422 return std::min(getIntImmCost(Imm, Ty, CostKind),
423 getIntImmCost(-Imm, Ty, CostKind));
424
425 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
426 Ty->getIntegerBitWidth() == 32) {
427 int64_t NegImm = -Imm.getSExtValue();
428 if (ST->isThumb2() && NegImm < 1<<12)
429 // icmp X, #-C -> cmn X, #C
430 return 0;
431 if (ST->isThumb() && NegImm < 1<<8)
432 // icmp X, #-C -> adds X, #C
433 return 0;
434 }
435
436 // xor a, -1 can always be folded to MVN
437 if (Opcode == Instruction::Xor && Imm.isAllOnes())
438 return 0;
439
440 // Ensures negative constant of min(max()) or max(min()) patterns that
441 // match to SSAT instructions don't get hoisted
442 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
443 Ty->getIntegerBitWidth() <= 32) {
444 if (isSSATMinMaxPattern(Inst, Imm) ||
445 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
446 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
447 return 0;
448 }
449
450 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
451 return 0;
452
453 // We can convert <= -1 to < 0, which is generally quite cheap.
454 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
455 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
456 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
457 return std::min(getIntImmCost(Imm, Ty, CostKind),
458 getIntImmCost(Imm + 1, Ty, CostKind));
459 }
460
461 return getIntImmCost(Imm, Ty, CostKind);
462 }
463
getCFInstrCost(unsigned Opcode,TTI::TargetCostKind CostKind,const Instruction * I)464 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
465 TTI::TargetCostKind CostKind,
466 const Instruction *I) {
467 if (CostKind == TTI::TCK_RecipThroughput &&
468 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
469 // FIXME: The vectorizer is highly sensistive to the cost of these
470 // instructions, which suggests that it may be using the costs incorrectly.
471 // But, for now, just make them free to avoid performance regressions for
472 // vector targets.
473 return 0;
474 }
475 return BaseT::getCFInstrCost(Opcode, CostKind, I);
476 }
477
getCastInstrCost(unsigned Opcode,Type * Dst,Type * Src,TTI::CastContextHint CCH,TTI::TargetCostKind CostKind,const Instruction * I)478 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
479 Type *Src,
480 TTI::CastContextHint CCH,
481 TTI::TargetCostKind CostKind,
482 const Instruction *I) {
483 int ISD = TLI->InstructionOpcodeToISD(Opcode);
484 assert(ISD && "Invalid opcode");
485
486 // TODO: Allow non-throughput costs that aren't binary.
487 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
488 if (CostKind != TTI::TCK_RecipThroughput)
489 return Cost == 0 ? 0 : 1;
490 return Cost;
491 };
492 auto IsLegalFPType = [this](EVT VT) {
493 EVT EltVT = VT.getScalarType();
494 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
495 (EltVT == MVT::f64 && ST->hasFP64()) ||
496 (EltVT == MVT::f16 && ST->hasFullFP16());
497 };
498
499 EVT SrcTy = TLI->getValueType(DL, Src);
500 EVT DstTy = TLI->getValueType(DL, Dst);
501
502 if (!SrcTy.isSimple() || !DstTy.isSimple())
503 return AdjustCost(
504 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
505
506 // Extending masked load/Truncating masked stores is expensive because we
507 // currently don't split them. This means that we'll likely end up
508 // loading/storing each element individually (hence the high cost).
509 if ((ST->hasMVEIntegerOps() &&
510 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
511 Opcode == Instruction::SExt)) ||
512 (ST->hasMVEFloatOps() &&
513 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
514 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
515 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
516 return 2 * DstTy.getVectorNumElements() *
517 ST->getMVEVectorCostFactor(CostKind);
518
519 // The extend of other kinds of load is free
520 if (CCH == TTI::CastContextHint::Normal ||
521 CCH == TTI::CastContextHint::Masked) {
522 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
523 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
524 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
525 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
526 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
527 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
528 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
529 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
530 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
531 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
532 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
533 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
534 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
535 };
536 if (const auto *Entry = ConvertCostTableLookup(
537 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
538 return AdjustCost(Entry->Cost);
539
540 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
541 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
542 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
543 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
544 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
545 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
546 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
547 // The following extend from a legal type to an illegal type, so need to
548 // split the load. This introduced an extra load operation, but the
549 // extend is still "free".
550 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
551 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
552 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
553 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
554 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
555 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
556 };
557 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
558 if (const auto *Entry =
559 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
560 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
561 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
562 }
563
564 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
565 // FPExtends are similar but also require the VCVT instructions.
566 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
567 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
568 };
569 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
570 if (const auto *Entry =
571 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
572 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
573 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
574 }
575
576 // The truncate of a store is free. This is the mirror of extends above.
577 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
578 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
579 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
580 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
581 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
582 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
583 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
584 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
585 };
586 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
587 if (const auto *Entry =
588 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
589 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
590 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
591 }
592
593 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
594 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
595 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
596 };
597 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
598 if (const auto *Entry =
599 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
600 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
601 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
602 }
603 }
604
605 // NEON vector operations that can extend their inputs.
606 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
607 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
608 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
609 // vaddl
610 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
611 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
612 // vsubl
613 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
614 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
615 // vmull
616 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
617 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
618 // vshll
619 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
620 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
621 };
622
623 auto *User = cast<Instruction>(*I->user_begin());
624 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
625 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
626 DstTy.getSimpleVT(),
627 SrcTy.getSimpleVT())) {
628 return AdjustCost(Entry->Cost);
629 }
630 }
631
632 // Single to/from double precision conversions.
633 if (Src->isVectorTy() && ST->hasNEON() &&
634 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
635 DstTy.getScalarType() == MVT::f32) ||
636 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
637 DstTy.getScalarType() == MVT::f64))) {
638 static const CostTblEntry NEONFltDblTbl[] = {
639 // Vector fptrunc/fpext conversions.
640 {ISD::FP_ROUND, MVT::v2f64, 2},
641 {ISD::FP_EXTEND, MVT::v2f32, 2},
642 {ISD::FP_EXTEND, MVT::v4f32, 4}};
643
644 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
645 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
646 return AdjustCost(LT.first * Entry->Cost);
647 }
648
649 // Some arithmetic, load and store operations have specific instructions
650 // to cast up/down their types automatically at no extra cost.
651 // TODO: Get these tables to know at least what the related operations are.
652 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
653 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
654 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
655 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
656 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
657 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
658 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
659
660 // The number of vmovl instructions for the extension.
661 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
662 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
663 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
664 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
665 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
666 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
667 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
668 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
669 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
670 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
671 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
672 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
673 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
674 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
675 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
676 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
677 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
678 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
679
680 // Operations that we legalize using splitting.
681 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
682 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
683
684 // Vector float <-> i32 conversions.
685 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
686 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
687
688 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
689 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
690 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
691 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
692 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
693 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
694 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
695 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
696 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
697 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
698 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
699 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
700 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
701 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
702 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
703 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
704 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
705 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
706 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
707 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
708
709 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
710 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
711 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
712 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
713 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
714 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
715
716 // Vector double <-> i32 conversions.
717 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
718 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
719
720 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
721 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
722 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
723 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
724 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
725 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
726
727 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
728 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
729 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
730 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
731 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
732 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
733 };
734
735 if (SrcTy.isVector() && ST->hasNEON()) {
736 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
737 DstTy.getSimpleVT(),
738 SrcTy.getSimpleVT()))
739 return AdjustCost(Entry->Cost);
740 }
741
742 // Scalar float to integer conversions.
743 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
744 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
745 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
746 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
747 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
748 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
749 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
750 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
751 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
752 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
753 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
754 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
755 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
756 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
757 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
758 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
759 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
760 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
761 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
762 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
763 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
764 };
765 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
766 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
767 DstTy.getSimpleVT(),
768 SrcTy.getSimpleVT()))
769 return AdjustCost(Entry->Cost);
770 }
771
772 // Scalar integer to float conversions.
773 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
774 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
775 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
776 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
777 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
778 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
779 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
780 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
781 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
782 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
783 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
784 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
785 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
786 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
787 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
788 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
789 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
790 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
791 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
792 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
793 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
794 };
795
796 if (SrcTy.isInteger() && ST->hasNEON()) {
797 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
798 ISD, DstTy.getSimpleVT(),
799 SrcTy.getSimpleVT()))
800 return AdjustCost(Entry->Cost);
801 }
802
803 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
804 // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
805 // are linearised so take more.
806 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
807 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
808 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
809 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
810 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
811 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
812 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
813 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
814 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
815 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
816 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
817 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
818 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
819 };
820
821 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
822 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
823 ISD, DstTy.getSimpleVT(),
824 SrcTy.getSimpleVT()))
825 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
826 }
827
828 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
829 // As general rule, fp converts that were not matched above are scalarized
830 // and cost 1 vcvt for each lane, so long as the instruction is available.
831 // If not it will become a series of function calls.
832 const InstructionCost CallCost =
833 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
834 int Lanes = 1;
835 if (SrcTy.isFixedLengthVector())
836 Lanes = SrcTy.getVectorNumElements();
837
838 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
839 return Lanes;
840 else
841 return Lanes * CallCost;
842 }
843
844 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
845 SrcTy.isFixedLengthVector()) {
846 // Treat a truncate with larger than legal source (128bits for MVE) as
847 // expensive, 2 instructions per lane.
848 if ((SrcTy.getScalarType() == MVT::i8 ||
849 SrcTy.getScalarType() == MVT::i16 ||
850 SrcTy.getScalarType() == MVT::i32) &&
851 SrcTy.getSizeInBits() > 128 &&
852 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
853 return SrcTy.getVectorNumElements() * 2;
854 }
855
856 // Scalar integer conversion costs.
857 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
858 // i16 -> i64 requires two dependent operations.
859 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
860
861 // Truncates on i64 are assumed to be free.
862 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
863 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
864 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
865 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
866 };
867
868 if (SrcTy.isInteger()) {
869 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
870 DstTy.getSimpleVT(),
871 SrcTy.getSimpleVT()))
872 return AdjustCost(Entry->Cost);
873 }
874
875 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
876 ? ST->getMVEVectorCostFactor(CostKind)
877 : 1;
878 return AdjustCost(
879 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
880 }
881
getVectorInstrCost(unsigned Opcode,Type * ValTy,TTI::TargetCostKind CostKind,unsigned Index,Value * Op0,Value * Op1)882 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
883 TTI::TargetCostKind CostKind,
884 unsigned Index, Value *Op0,
885 Value *Op1) {
886 // Penalize inserting into an D-subregister. We end up with a three times
887 // lower estimated throughput on swift.
888 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
889 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
890 return 3;
891
892 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
893 Opcode == Instruction::ExtractElement)) {
894 // Cross-class copies are expensive on many microarchitectures,
895 // so assume they are expensive by default.
896 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
897 return 3;
898
899 // Even if it's not a cross class copy, this likely leads to mixing
900 // of NEON and VFP code and should be therefore penalized.
901 if (ValTy->isVectorTy() &&
902 ValTy->getScalarSizeInBits() <= 32)
903 return std::max<InstructionCost>(
904 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
905 2U);
906 }
907
908 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
909 Opcode == Instruction::ExtractElement)) {
910 // Integer cross-lane moves are more expensive than float, which can
911 // sometimes just be vmovs. Integer involve being passes to GPR registers,
912 // causing more of a delay.
913 std::pair<InstructionCost, MVT> LT =
914 getTypeLegalizationCost(ValTy->getScalarType());
915 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
916 }
917
918 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
919 }
920
getCmpSelInstrCost(unsigned Opcode,Type * ValTy,Type * CondTy,CmpInst::Predicate VecPred,TTI::TargetCostKind CostKind,const Instruction * I)921 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
922 Type *CondTy,
923 CmpInst::Predicate VecPred,
924 TTI::TargetCostKind CostKind,
925 const Instruction *I) {
926 int ISD = TLI->InstructionOpcodeToISD(Opcode);
927
928 // Thumb scalar code size cost for select.
929 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
930 ST->isThumb() && !ValTy->isVectorTy()) {
931 // Assume expensive structs.
932 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
933 return TTI::TCC_Expensive;
934
935 // Select costs can vary because they:
936 // - may require one or more conditional mov (including an IT),
937 // - can't operate directly on immediates,
938 // - require live flags, which we can't copy around easily.
939 InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
940
941 // Possible IT instruction for Thumb2, or more for Thumb1.
942 ++Cost;
943
944 // i1 values may need rematerialising by using mov immediates and/or
945 // flag setting instructions.
946 if (ValTy->isIntegerTy(1))
947 ++Cost;
948
949 return Cost;
950 }
951
952 // If this is a vector min/max/abs, use the cost of that intrinsic directly
953 // instead. Hopefully when min/max intrinsics are more prevalent this code
954 // will not be needed.
955 const Instruction *Sel = I;
956 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
957 Sel->hasOneUse())
958 Sel = cast<Instruction>(Sel->user_back());
959 if (Sel && ValTy->isVectorTy() &&
960 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
961 const Value *LHS, *RHS;
962 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
963 unsigned IID = 0;
964 switch (SPF) {
965 case SPF_ABS:
966 IID = Intrinsic::abs;
967 break;
968 case SPF_SMIN:
969 IID = Intrinsic::smin;
970 break;
971 case SPF_SMAX:
972 IID = Intrinsic::smax;
973 break;
974 case SPF_UMIN:
975 IID = Intrinsic::umin;
976 break;
977 case SPF_UMAX:
978 IID = Intrinsic::umax;
979 break;
980 case SPF_FMINNUM:
981 IID = Intrinsic::minnum;
982 break;
983 case SPF_FMAXNUM:
984 IID = Intrinsic::maxnum;
985 break;
986 default:
987 break;
988 }
989 if (IID) {
990 // The ICmp is free, the select gets the cost of the min/max/etc
991 if (Sel != I)
992 return 0;
993 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
994 return getIntrinsicInstrCost(CostAttrs, CostKind);
995 }
996 }
997
998 // On NEON a vector select gets lowered to vbsl.
999 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1000 // Lowering of some vector selects is currently far from perfect.
1001 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1002 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1003 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1004 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1005 };
1006
1007 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1008 EVT SelValTy = TLI->getValueType(DL, ValTy);
1009 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1010 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1011 SelCondTy.getSimpleVT(),
1012 SelValTy.getSimpleVT()))
1013 return Entry->Cost;
1014 }
1015
1016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1017 return LT.first;
1018 }
1019
1020 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1021 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1022 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1023 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1024 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1025 if (!VecCondTy)
1026 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1027
1028 // If we don't have mve.fp any fp operations will need to be scalarized.
1029 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1030 // One scalaization insert, one scalarization extract and the cost of the
1031 // fcmps.
1032 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1033 /*Extract*/ true, CostKind) +
1034 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1035 /*Extract*/ false, CostKind) +
1036 VecValTy->getNumElements() *
1037 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1038 VecCondTy->getScalarType(), VecPred,
1039 CostKind, I);
1040 }
1041
1042 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1043 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1044 // There are two types - the input that specifies the type of the compare
1045 // and the output vXi1 type. Because we don't know how the output will be
1046 // split, we may need an expensive shuffle to get two in sync. This has the
1047 // effect of making larger than legal compares (v8i32 for example)
1048 // expensive.
1049 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1050 if (LT.first > 1)
1051 return LT.first * BaseCost +
1052 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1053 /*Extract*/ false, CostKind);
1054 return BaseCost;
1055 }
1056 }
1057
1058 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1059 // for "multiple beats" potentially needed by MVE instructions.
1060 int BaseCost = 1;
1061 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1062 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063
1064 return BaseCost *
1065 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1066 }
1067
getAddressComputationCost(Type * Ty,ScalarEvolution * SE,const SCEV * Ptr)1068 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1069 ScalarEvolution *SE,
1070 const SCEV *Ptr) {
1071 // Address computations in vectorized code with non-consecutive addresses will
1072 // likely result in more instructions compared to scalar code where the
1073 // computation can more often be merged into the index mode. The resulting
1074 // extra micro-ops can significantly decrease throughput.
1075 unsigned NumVectorInstToHideOverhead = 10;
1076 int MaxMergeDistance = 64;
1077
1078 if (ST->hasNEON()) {
1079 if (Ty->isVectorTy() && SE &&
1080 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1081 return NumVectorInstToHideOverhead;
1082
1083 // In many cases the address computation is not merged into the instruction
1084 // addressing mode.
1085 return 1;
1086 }
1087 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1088 }
1089
isProfitableLSRChainElement(Instruction * I)1090 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1091 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1092 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1093 // optimized, else LSR may block tail-predication.
1094 switch (II->getIntrinsicID()) {
1095 case Intrinsic::arm_mve_vctp8:
1096 case Intrinsic::arm_mve_vctp16:
1097 case Intrinsic::arm_mve_vctp32:
1098 case Intrinsic::arm_mve_vctp64:
1099 return true;
1100 default:
1101 break;
1102 }
1103 }
1104 return false;
1105 }
1106
isLegalMaskedLoad(Type * DataTy,Align Alignment)1107 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1108 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1109 return false;
1110
1111 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1112 // Don't support v2i1 yet.
1113 if (VecTy->getNumElements() == 2)
1114 return false;
1115
1116 // We don't support extending fp types.
1117 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1118 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1119 return false;
1120 }
1121
1122 unsigned EltWidth = DataTy->getScalarSizeInBits();
1123 return (EltWidth == 32 && Alignment >= 4) ||
1124 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1125 }
1126
isLegalMaskedGather(Type * Ty,Align Alignment)1127 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1128 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1129 return false;
1130
1131 unsigned EltWidth = Ty->getScalarSizeInBits();
1132 return ((EltWidth == 32 && Alignment >= 4) ||
1133 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1134 }
1135
1136 /// Given a memcpy/memset/memmove instruction, return the number of memory
1137 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1138 /// call is used.
getNumMemOps(const IntrinsicInst * I) const1139 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1140 MemOp MOp;
1141 unsigned DstAddrSpace = ~0u;
1142 unsigned SrcAddrSpace = ~0u;
1143 const Function *F = I->getParent()->getParent();
1144
1145 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1146 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1147 // If 'size' is not a constant, a library call will be generated.
1148 if (!C)
1149 return -1;
1150
1151 const unsigned Size = C->getValue().getZExtValue();
1152 const Align DstAlign = *MC->getDestAlign();
1153 const Align SrcAlign = *MC->getSourceAlign();
1154
1155 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1156 /*IsVolatile*/ false);
1157 DstAddrSpace = MC->getDestAddressSpace();
1158 SrcAddrSpace = MC->getSourceAddressSpace();
1159 }
1160 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1161 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1162 // If 'size' is not a constant, a library call will be generated.
1163 if (!C)
1164 return -1;
1165
1166 const unsigned Size = C->getValue().getZExtValue();
1167 const Align DstAlign = *MS->getDestAlign();
1168
1169 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1170 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1171 DstAddrSpace = MS->getDestAddressSpace();
1172 }
1173 else
1174 llvm_unreachable("Expected a memcpy/move or memset!");
1175
1176 unsigned Limit, Factor = 2;
1177 switch(I->getIntrinsicID()) {
1178 case Intrinsic::memcpy:
1179 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1180 break;
1181 case Intrinsic::memmove:
1182 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1183 break;
1184 case Intrinsic::memset:
1185 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1186 Factor = 1;
1187 break;
1188 default:
1189 llvm_unreachable("Expected a memcpy/move or memset!");
1190 }
1191
1192 // MemOps will be poplulated with a list of data types that needs to be
1193 // loaded and stored. That's why we multiply the number of elements by 2 to
1194 // get the cost for this memcpy.
1195 std::vector<EVT> MemOps;
1196 if (getTLI()->findOptimalMemOpLowering(
1197 MemOps, Limit, MOp, DstAddrSpace,
1198 SrcAddrSpace, F->getAttributes()))
1199 return MemOps.size() * Factor;
1200
1201 // If we can't find an optimal memop lowering, return the default cost
1202 return -1;
1203 }
1204
getMemcpyCost(const Instruction * I)1205 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1206 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1207
1208 // To model the cost of a library call, we assume 1 for the call, and
1209 // 3 for the argument setup.
1210 if (NumOps == -1)
1211 return 4;
1212 return NumOps;
1213 }
1214
getShuffleCost(TTI::ShuffleKind Kind,VectorType * Tp,ArrayRef<int> Mask,TTI::TargetCostKind CostKind,int Index,VectorType * SubTp,ArrayRef<const Value * > Args,const Instruction * CxtI)1215 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1216 VectorType *Tp, ArrayRef<int> Mask,
1217 TTI::TargetCostKind CostKind,
1218 int Index, VectorType *SubTp,
1219 ArrayRef<const Value *> Args,
1220 const Instruction *CxtI) {
1221 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1222 // Treat extractsubvector as single op permutation.
1223 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1224 if (IsExtractSubvector)
1225 Kind = TTI::SK_PermuteSingleSrc;
1226 if (ST->hasNEON()) {
1227 if (Kind == TTI::SK_Broadcast) {
1228 static const CostTblEntry NEONDupTbl[] = {
1229 // VDUP handles these cases.
1230 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1231 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1232 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1233 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1234 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1235 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1236
1237 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1238 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1239 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1240 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1241
1242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1243 if (const auto *Entry =
1244 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1245 return LT.first * Entry->Cost;
1246 }
1247 if (Kind == TTI::SK_Reverse) {
1248 static const CostTblEntry NEONShuffleTbl[] = {
1249 // Reverse shuffle cost one instruction if we are shuffling within a
1250 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1251 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1252 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1253 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1254 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1255 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1256 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1257
1258 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1259 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1260 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1261 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1262
1263 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1264 if (const auto *Entry =
1265 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1266 return LT.first * Entry->Cost;
1267 }
1268 if (Kind == TTI::SK_Select) {
1269 static const CostTblEntry NEONSelShuffleTbl[] = {
1270 // Select shuffle cost table for ARM. Cost is the number of
1271 // instructions
1272 // required to create the shuffled vector.
1273
1274 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1275 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1276 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1277 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1278
1279 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1280 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1281 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1282
1283 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1284
1285 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1286
1287 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1288 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1289 ISD::VECTOR_SHUFFLE, LT.second))
1290 return LT.first * Entry->Cost;
1291 }
1292 }
1293 if (ST->hasMVEIntegerOps()) {
1294 if (Kind == TTI::SK_Broadcast) {
1295 static const CostTblEntry MVEDupTbl[] = {
1296 // VDUP handles these cases.
1297 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1298 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1299 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1300 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1301 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1302
1303 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1304 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1305 LT.second))
1306 return LT.first * Entry->Cost *
1307 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1308 }
1309
1310 if (!Mask.empty()) {
1311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1312 if (LT.second.isVector() &&
1313 Mask.size() <= LT.second.getVectorNumElements() &&
1314 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1315 isVREVMask(Mask, LT.second, 64)))
1316 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1317 }
1318 }
1319
1320 // Restore optimal kind.
1321 if (IsExtractSubvector)
1322 Kind = TTI::SK_ExtractSubvector;
1323 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1324 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1325 : 1;
1326 return BaseCost *
1327 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1328 }
1329
getArithmeticInstrCost(unsigned Opcode,Type * Ty,TTI::TargetCostKind CostKind,TTI::OperandValueInfo Op1Info,TTI::OperandValueInfo Op2Info,ArrayRef<const Value * > Args,const Instruction * CxtI)1330 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1331 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1332 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1333 ArrayRef<const Value *> Args,
1334 const Instruction *CxtI) {
1335 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1336 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1337 // Make operations on i1 relatively expensive as this often involves
1338 // combining predicates. AND and XOR should be easier to handle with IT
1339 // blocks.
1340 switch (ISDOpcode) {
1341 default:
1342 break;
1343 case ISD::AND:
1344 case ISD::XOR:
1345 return 2;
1346 case ISD::OR:
1347 return 3;
1348 }
1349 }
1350
1351 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1352
1353 if (ST->hasNEON()) {
1354 const unsigned FunctionCallDivCost = 20;
1355 const unsigned ReciprocalDivCost = 10;
1356 static const CostTblEntry CostTbl[] = {
1357 // Division.
1358 // These costs are somewhat random. Choose a cost of 20 to indicate that
1359 // vectorizing devision (added function call) is going to be very expensive.
1360 // Double registers types.
1361 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1362 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1363 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1364 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1365 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1366 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1367 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1368 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1369 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1370 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1371 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1372 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1373 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1374 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1375 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1376 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1377 // Quad register types.
1378 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1379 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1380 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1381 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1382 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1383 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1384 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1385 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1386 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1387 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1388 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1389 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1390 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1391 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1392 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1393 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1394 // Multiplication.
1395 };
1396
1397 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1398 return LT.first * Entry->Cost;
1399
1400 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1401 Opcode, Ty, CostKind, Op1Info, Op2Info);
1402
1403 // This is somewhat of a hack. The problem that we are facing is that SROA
1404 // creates a sequence of shift, and, or instructions to construct values.
1405 // These sequences are recognized by the ISel and have zero-cost. Not so for
1406 // the vectorized code. Because we have support for v2i64 but not i64 those
1407 // sequences look particularly beneficial to vectorize.
1408 // To work around this we increase the cost of v2i64 operations to make them
1409 // seem less beneficial.
1410 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1411 Cost += 4;
1412
1413 return Cost;
1414 }
1415
1416 // If this operation is a shift on arm/thumb2, it might well be folded into
1417 // the following instruction, hence having a cost of 0.
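// For example, "%s = shl i32 %a, 2; %r = add i32 %b, %s" can typically be
// selected as a single "add r0, r1, r2, lsl #2" when the shift has one use.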
1418 auto LooksLikeAFreeShift = [&]() {
1419 if (ST->isThumb1Only() || Ty->isVectorTy())
1420 return false;
1421
1422 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1423 return false;
1424 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1425 return false;
1426
1427 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1428 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1429 case Instruction::Add:
1430 case Instruction::Sub:
1431 case Instruction::And:
1432 case Instruction::Xor:
1433 case Instruction::Or:
1434 case Instruction::ICmp:
1435 return true;
1436 default:
1437 return false;
1438 }
1439 };
1440 if (LooksLikeAFreeShift())
1441 return 0;
1442
1443 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1444 // for "multiple beats" potentially needed by MVE instructions.
1445 int BaseCost = 1;
1446 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1447 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1448
1449 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1450 // without treating floats as more expensive than scalars or increasing the
1451 // costs for custom operations. The result is also multiplied by the
1452 // MVEVectorCostFactor where appropriate.
1453 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1454 return LT.first * BaseCost;
1455
1456 // Else this is expand, assume that we need to scalarize this op.
1457 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1458 unsigned Num = VTy->getNumElements();
1459 InstructionCost Cost =
1460 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1461 // Return the cost of multiple scalar invocation plus the cost of
1462 // inserting and extracting the values.
1463 SmallVector<Type *> Tys(Args.size(), Ty);
1464 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1465 Num * Cost;
1466 }
1467
1468 return BaseCost;
1469 }
1470
1471 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1472 MaybeAlign Alignment,
1473 unsigned AddressSpace,
1474 TTI::TargetCostKind CostKind,
1475 TTI::OperandValueInfo OpInfo,
1476 const Instruction *I) {
1477 // TODO: Handle other cost kinds.
1478 if (CostKind != TTI::TCK_RecipThroughput)
1479 return 1;
1480
1481 // Type legalization can't handle structs
1482 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1483 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1484 CostKind);
1485
1486 if (ST->hasNEON() && Src->isVectorTy() &&
1487 (Alignment && *Alignment != Align(16)) &&
1488 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1489 // Unaligned loads/stores are extremely inefficient.
1490 // We need 4 uops for a vst.1/vld.1 vs 1 uop for a vldr/vstr.
1491 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1492 return LT.first * 4;
1493 }
1494
1495 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1496 // Same for stores.
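// e.g. "%l = load <4 x half>; %e = fpext <4 x half> %l to <4 x float>" can
// use an extending vldrh.u32-style load, so cost it as one MVE operation.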
1497 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1498 ((Opcode == Instruction::Load && I->hasOneUse() &&
1499 isa<FPExtInst>(*I->user_begin())) ||
1500 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1501 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1502 Type *DstTy =
1503 Opcode == Instruction::Load
1504 ? (*I->user_begin())->getType()
1505 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1506 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1507 DstTy->getScalarType()->isFloatTy())
1508 return ST->getMVEVectorCostFactor(CostKind);
1509 }
1510
1511 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1512 ? ST->getMVEVectorCostFactor(CostKind)
1513 : 1;
1514 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1515 CostKind, OpInfo, I);
1516 }
1517
1518 InstructionCost
1519 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1520 unsigned AddressSpace,
1521 TTI::TargetCostKind CostKind) {
1522 if (ST->hasMVEIntegerOps()) {
1523 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1524 return ST->getMVEVectorCostFactor(CostKind);
1525 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1526 return ST->getMVEVectorCostFactor(CostKind);
1527 }
1528 if (!isa<FixedVectorType>(Src))
1529 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1530 CostKind);
1531 // Scalar cost, which is currently very high due to the inefficiency of the
1532 // generated code.
1533 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1534 }
1535
1536 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1537 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1538 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1539 bool UseMaskForCond, bool UseMaskForGaps) {
1540 assert(Factor >= 2 && "Invalid interleave factor");
1541 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1542
1543 // vldN/vstN doesn't support vector types of i64/f64 element.
1544 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1545
1546 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1547 !UseMaskForCond && !UseMaskForGaps) {
1548 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1549 auto *SubVecTy =
1550 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1551
1552 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1553 // Accesses having vector types that are a multiple of 128 bits can be
1554 // matched to more than one vldN/vstN instruction.
1555 int BaseCost =
1556 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1557 if (NumElts % Factor == 0 &&
1558 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1559 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1560
1561 // Some smaller than legal interleaved patterns are cheap as we can make
1562 // use of the vmovn or vrev patterns to interleave a standard load. This is
1563 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1564 // promoted differently). The cost of 2 here is then a load and vrev or
1565 // vmovn.
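// e.g. a factor-2 access on <8 x i8> can be a single 64-bit load plus a vrev
// or vmovn to separate the lanes, hence the 2 * BaseCost below.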
1566 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1567 VecTy->isIntOrIntVectorTy() &&
1568 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1569 return 2 * BaseCost;
1570 }
1571
1572 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1573 Alignment, AddressSpace, CostKind,
1574 UseMaskForCond, UseMaskForGaps);
1575 }
1576
1577 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1578 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1579 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1580 using namespace PatternMatch;
1581 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1582 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1583 Alignment, CostKind, I);
1584
1585 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1586 auto *VTy = cast<FixedVectorType>(DataTy);
1587
1588 // TODO: Splitting, once we do that.
1589
1590 unsigned NumElems = VTy->getNumElements();
1591 unsigned EltSize = VTy->getScalarSizeInBits();
1592 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1593
1594 // For now, it is assumed that for the MVE gather instructions the loads are
1595 // all effectively serialised. This means the cost is the scalar cost
1596 // multiplied by the number of elements being loaded. This is possibly very
1597 // conservative, but even so we still end up vectorising loops because the
1598 // cost per iteration for many loops is lower than for scalar loops.
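// e.g. a gather of <4 x i32> is costed as roughly four serialised loads
// scaled by the MVE cost factor, not as a single cheap vector load.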
1599 InstructionCost VectorCost =
1600 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1601 // The scalarization cost should be a lot higher. We use the number of vector
1602 // elements plus the scalarization overhead. If masking is required then a lot
1603 // of little blocks will be needed and potentially a scalarized p0 mask,
1604 // greatly increasing the cost.
1605 InstructionCost ScalarCost =
1606 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1607 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1608 CostKind) +
1609 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1610 CostKind);
1611
1612 if (EltSize < 8 || Alignment < EltSize / 8)
1613 return ScalarCost;
1614
1615 unsigned ExtSize = EltSize;
1616 // Check whether there's a single user that asks for an extended type
1617 if (I != nullptr) {
1618 // Depending on the caller of this function, a gather instruction will
1619 // either have opcode Instruction::Load or be a call to the masked_gather
1620 // intrinsic.
1621 if ((I->getOpcode() == Instruction::Load ||
1622 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1623 I->hasOneUse()) {
1624 const User *Us = *I->users().begin();
1625 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1626 // only allow valid type combinations
1627 unsigned TypeSize =
1628 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1629 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1630 (TypeSize == 16 && EltSize == 8)) &&
1631 TypeSize * NumElems == 128) {
1632 ExtSize = TypeSize;
1633 }
1634 }
1635 }
1636 // Check whether the input data needs to be truncated
1637 TruncInst *T;
1638 if ((I->getOpcode() == Instruction::Store ||
1639 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1640 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1641 // Only allow valid type combinations
1642 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1643 if (((EltSize == 16 && TypeSize == 32) ||
1644 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1645 TypeSize * NumElems == 128)
1646 ExtSize = TypeSize;
1647 }
1648 }
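// At this point ExtSize is the widest element the gather/scatter produces or
// consumes, e.g. a <4 x i8> gather zero-extended to <4 x i32> gives
// ExtSize == 32 and can use an extending gather rather than scalar code.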
1649
1650 if (ExtSize * NumElems != 128 || NumElems < 4)
1651 return ScalarCost;
1652
1653 // Any (aligned) i32 gather will not need to be scalarised.
1654 if (ExtSize == 32)
1655 return VectorCost;
1656 // For smaller types, we need to ensure that the gep's inputs are correctly
1657 // extended from a small enough value. Other sizes (including i64) are
1658 // scalarized for now.
1659 if (ExtSize != 8 && ExtSize != 16)
1660 return ScalarCost;
1661
1662 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1663 Ptr = BC->getOperand(0);
1664 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1665 if (GEP->getNumOperands() != 2)
1666 return ScalarCost;
1667 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1668 // Scale needs to be correct (which is only relevant for i16s).
1669 if (Scale != 1 && Scale * 8 != ExtSize)
1670 return ScalarCost;
1671 // And we need to zext (not sext) the indexes from a small enough type.
1672 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1673 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1674 return VectorCost;
1675 }
1676 return ScalarCost;
1677 }
1678 return ScalarCost;
1679 }
1680
1681 InstructionCost
1682 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1683 std::optional<FastMathFlags> FMF,
1684 TTI::TargetCostKind CostKind) {
1685
1686 EVT ValVT = TLI->getValueType(DL, ValTy);
1687 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1688 unsigned EltSize = ValVT.getScalarSizeInBits();
1689
1690 // In general floating point reductions are a series of elementwise
1691 // operations, with free extracts on each step. These are either in-order or
1692 // treewise depending on whether that is allowed by the fast math flags.
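// e.g. with MVE, a fast-math fadd reduction of <8 x float> costs one v4f32
// add for the halving step plus four scalar fadds for the remaining lanes.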
1693 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1694 ((EltSize == 32 && ST->hasVFP2Base()) ||
1695 (EltSize == 64 && ST->hasFP64()) ||
1696 (EltSize == 16 && ST->hasFullFP16()))) {
1697 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1698 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1699 InstructionCost VecCost = 0;
1700 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1701 NumElts * EltSize > VecLimit) {
1702 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1703 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1704 NumElts /= 2;
1705 }
1706
1707 // For fp16 we need to extract the upper lane elements. MVE can add a
1708 // VREV+FMIN/MAX to perform another vector step instead.
1709 InstructionCost ExtractCost = 0;
1710 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1711 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1712 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1713 NumElts /= 2;
1714 } else if (ValVT.getVectorElementType() == MVT::f16)
1715 ExtractCost = NumElts / 2;
1716
1717 return VecCost + ExtractCost +
1718 NumElts *
1719 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1720 }
1721
1722 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1723 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1724 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1725 unsigned VecLimit =
1726 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1727 InstructionCost VecCost = 0;
1728 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1729 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1730 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1731 NumElts /= 2;
1732 }
1733 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1734 // step.
1735 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1736 NumElts * EltSize == 64) {
1737 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1738 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1739 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1740 NumElts /= 2;
1741 }
1742
1743 // From here we extract the elements and perform the and/or/xor.
1744 InstructionCost ExtractCost = NumElts;
1745 return VecCost + ExtractCost +
1746 (NumElts - 1) * getArithmeticInstrCost(
1747 Opcode, ValTy->getElementType(), CostKind);
1748 }
1749
1750 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1751 TTI::requiresOrderedReduction(FMF))
1752 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1753
1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1755
1756 static const CostTblEntry CostTblAdd[]{
1757 {ISD::ADD, MVT::v16i8, 1},
1758 {ISD::ADD, MVT::v8i16, 1},
1759 {ISD::ADD, MVT::v4i32, 1},
1760 };
1761 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1762 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1763
1764 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1765 }
1766
1767 InstructionCost ARMTTIImpl::getExtendedReductionCost(
1768 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1769 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1770 EVT ValVT = TLI->getValueType(DL, ValTy);
1771 EVT ResVT = TLI->getValueType(DL, ResTy);
1772
1773 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1774
1775 switch (ISD) {
1776 case ISD::ADD:
1777 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1778 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1779
1780 // The legal cases are:
1781 // VADDV u/s 8/16/32
1782 // VADDLV u/s 32
1783 // Codegen currently cannot always handle larger than legal vectors very
1784 // well, especially for predicated reductions where the mask needs to be
1785 // split, so restrict to 128bit or smaller input types.
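// e.g. an add reduction of a v16i8 input zero-extended to i32 maps to
// VADDV.u8, and a v4i32 input widened to i64 maps to VADDLV.u32.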
1786 unsigned RevVTSize = ResVT.getSizeInBits();
1787 if (ValVT.getSizeInBits() <= 128 &&
1788 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1789 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1790 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1791 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1792 }
1793 break;
1794 default:
1795 break;
1796 }
1797 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1798 CostKind);
1799 }
1800
1801 InstructionCost
1802 ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1803 VectorType *ValTy,
1804 TTI::TargetCostKind CostKind) {
1805 EVT ValVT = TLI->getValueType(DL, ValTy);
1806 EVT ResVT = TLI->getValueType(DL, ResTy);
1807
1808 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1809 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1810
1811 // The legal cases are:
1812 // VMLAV u/s 8/16/32
1813 // VMLALV u/s 16/32
1814 // Codegen currently cannot always handle larger than legal vectors very
1815 // well, especially for predicated reductions where the mask needs to be
1816 // split, so restrict to 128bit or smaller input types.
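// e.g. a v8i16 multiply accumulated into an i32 result maps to VMLAV.s16,
// and a v4i32 multiply accumulated into an i64 result maps to VMLALV.s32.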
1817 unsigned RevVTSize = ResVT.getSizeInBits();
1818 if (ValVT.getSizeInBits() <= 128 &&
1819 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1820 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1821 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1822 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1823 }
1824
1825 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1826 }
1827
1828 InstructionCost
1829 ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1830 FastMathFlags FMF,
1831 TTI::TargetCostKind CostKind) {
1832 EVT ValVT = TLI->getValueType(DL, Ty);
1833
1834 // In general floating point reductions are a series of elementwise
1835 // operations, with free extracts on each step. These are either in-order or
1836 // treewise depending on whether that is allowed by the fast math flags.
1837 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1838 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1839 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1840 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1841 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1842 unsigned EltSize = ValVT.getScalarSizeInBits();
1843 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1844 InstructionCost VecCost;
1845 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1846 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1847 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1848 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1849 NumElts /= 2;
1850 }
1851
1852 // For fp16 we need to extract the upper lane elements. MVE can add a
1853 // VREV+FMIN/MAX to perform another vector step instead.
1854 InstructionCost ExtractCost = 0;
1855 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1856 NumElts == 8) {
1857 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1858 NumElts /= 2;
1859 } else if (ValVT.getVectorElementType() == MVT::f16)
1860 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1861
1862 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1863 {Ty->getElementType(), Ty->getElementType()},
1864 FMF);
1865 return VecCost + ExtractCost +
1866 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1867 }
1868
1869 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1870 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1871 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1872
1873 // All costs are the same for u/s min/max. These lower to vminv, which are
1874 // given a slightly higher cost as they tend to take multiple cycles for
1875 // smaller type sizes.
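// e.g. a umin reduction of v16i8 lowers to VMINV.u8 and is costed at 4x
// the MVE cost factor below, versus 2x for the v4i32 form.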
1876 static const CostTblEntry CostTblAdd[]{
1877 {ISD::SMIN, MVT::v16i8, 4},
1878 {ISD::SMIN, MVT::v8i16, 3},
1879 {ISD::SMIN, MVT::v4i32, 2},
1880 };
1881 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1882 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1883 }
1884
1885 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1886 }
1887
1888 InstructionCost
1889 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1890 TTI::TargetCostKind CostKind) {
1891 switch (ICA.getID()) {
1892 case Intrinsic::get_active_lane_mask:
1893 // Currently we make a somewhat optimistic assumption that
1894 // active_lane_mask calls are always free. In reality one may be freely folded
1895 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1896 // of add/icmp code. We may need to improve this in the future, but being
1897 // able to detect if it is free or not involves looking at a lot of other
1898 // code. We currently assume that the vectorizer inserted these, and knew
1899 // what it was doing in adding one.
1900 if (ST->hasMVEIntegerOps())
1901 return 0;
1902 break;
1903 case Intrinsic::sadd_sat:
1904 case Intrinsic::ssub_sat:
1905 case Intrinsic::uadd_sat:
1906 case Intrinsic::usub_sat: {
1907 if (!ST->hasMVEIntegerOps())
1908 break;
1909 Type *VT = ICA.getReturnType();
1910
1911 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1912 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1913 LT.second == MVT::v16i8) {
1914 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1915 // need to extend the type, as it uses shr(qadd(shl, shl)).
1916 unsigned Instrs =
1917 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1918 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1919 }
1920 break;
1921 }
1922 case Intrinsic::abs:
1923 case Intrinsic::smin:
1924 case Intrinsic::smax:
1925 case Intrinsic::umin:
1926 case Intrinsic::umax: {
1927 if (!ST->hasMVEIntegerOps())
1928 break;
1929 Type *VT = ICA.getReturnType();
1930
1931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1932 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1933 LT.second == MVT::v16i8)
1934 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1935 break;
1936 }
1937 case Intrinsic::minnum:
1938 case Intrinsic::maxnum: {
1939 if (!ST->hasMVEFloatOps())
1940 break;
1941 Type *VT = ICA.getReturnType();
1942 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1943 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1944 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1945 break;
1946 }
1947 case Intrinsic::fptosi_sat:
1948 case Intrinsic::fptoui_sat: {
1949 if (ICA.getArgTypes().empty())
1950 break;
1951 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1952 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1953 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1954 // Check for the legal types, with the correct subtarget features.
1955 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1956 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1957 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1958 return LT.first;
1959
1960 // Equally for MVE vector types
1961 if (ST->hasMVEFloatOps() &&
1962 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1963 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1964 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1965
1966 // Otherwise we use a legal convert followed by a min+max
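// e.g. llvm.fptosi.sat.i16.f32 can use a plain f32->i32 convert followed by
// an i32 smin/smax clamp to the i16 range.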
1967 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1968 (ST->hasFP64() && LT.second == MVT::f64) ||
1969 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1970 (ST->hasMVEFloatOps() &&
1971 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1972 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1973 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1974 LT.second.getScalarSizeInBits());
1975 InstructionCost Cost =
1976 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1977 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1978 : Intrinsic::umin,
1979 LegalTy, {LegalTy, LegalTy});
1980 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1981 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1982 : Intrinsic::umax,
1983 LegalTy, {LegalTy, LegalTy});
1984 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1985 return LT.first * Cost;
1986 }
1987 break;
1988 }
1989 }
1990
1991 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1992 }
1993
1994 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1995 if (!F->isIntrinsic())
1996 return BaseT::isLoweredToCall(F);
1997
1998 // Assume all Arm-specific intrinsics map to an instruction.
1999 if (F->getName().starts_with("llvm.arm"))
2000 return false;
2001
2002 switch (F->getIntrinsicID()) {
2003 default: break;
2004 case Intrinsic::powi:
2005 case Intrinsic::sin:
2006 case Intrinsic::cos:
2007 case Intrinsic::pow:
2008 case Intrinsic::log:
2009 case Intrinsic::log10:
2010 case Intrinsic::log2:
2011 case Intrinsic::exp:
2012 case Intrinsic::exp2:
2013 return true;
2014 case Intrinsic::sqrt:
2015 case Intrinsic::fabs:
2016 case Intrinsic::copysign:
2017 case Intrinsic::floor:
2018 case Intrinsic::ceil:
2019 case Intrinsic::trunc:
2020 case Intrinsic::rint:
2021 case Intrinsic::nearbyint:
2022 case Intrinsic::round:
2023 case Intrinsic::canonicalize:
2024 case Intrinsic::lround:
2025 case Intrinsic::llround:
2026 case Intrinsic::lrint:
2027 case Intrinsic::llrint:
2028 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2029 return true;
2030 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2031 return true;
2032 // Some operations can be handled by vector instructions; assume that
2033 // unsupported vectors will be expanded into supported scalar ones.
2034 // TODO Handle scalar operations properly.
2035 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2036 case Intrinsic::masked_store:
2037 case Intrinsic::masked_load:
2038 case Intrinsic::masked_gather:
2039 case Intrinsic::masked_scatter:
2040 return !ST->hasMVEIntegerOps();
2041 case Intrinsic::sadd_with_overflow:
2042 case Intrinsic::uadd_with_overflow:
2043 case Intrinsic::ssub_with_overflow:
2044 case Intrinsic::usub_with_overflow:
2045 case Intrinsic::sadd_sat:
2046 case Intrinsic::uadd_sat:
2047 case Intrinsic::ssub_sat:
2048 case Intrinsic::usub_sat:
2049 return false;
2050 }
2051
2052 return BaseT::isLoweredToCall(F);
2053 }
2054
2055 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
2056 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2057 EVT VT = TLI->getValueType(DL, I.getType(), true);
2058 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2059 return true;
2060
2061 // Check if an intrinsic will be lowered to a call and assume that any
2062 // other CallInst will generate a bl.
2063 if (auto *Call = dyn_cast<CallInst>(&I)) {
2064 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2065 switch(II->getIntrinsicID()) {
2066 case Intrinsic::memcpy:
2067 case Intrinsic::memset:
2068 case Intrinsic::memmove:
2069 return getNumMemOps(II) == -1;
2070 default:
2071 if (const Function *F = Call->getCalledFunction())
2072 return isLoweredToCall(F);
2073 }
2074 }
2075 return true;
2076 }
2077
2078 // FPv5 provides conversions between integer, double-precision,
2079 // single-precision, and half-precision formats.
2080 switch (I.getOpcode()) {
2081 default:
2082 break;
2083 case Instruction::FPToSI:
2084 case Instruction::FPToUI:
2085 case Instruction::SIToFP:
2086 case Instruction::UIToFP:
2087 case Instruction::FPTrunc:
2088 case Instruction::FPExt:
2089 return !ST->hasFPARMv8Base();
2090 }
2091
2092 // FIXME: Unfortunately the approach of checking the Operation Action does
2093 // not catch all cases of Legalization that use library calls. Our
2094 // Legalization step categorizes some transformations into library calls as
2095 // Custom, Expand or even Legal when doing type legalization. So for now
2096 // we have to special case for instance the SDIV of 64bit integers and the
2097 // use of floating point emulation.
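// e.g. a 64-bit sdiv/srem expands to a runtime call such as __aeabi_ldivmod
// on AEABI targets, so conservatively treat it as a call.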
2098 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2099 switch (ISD) {
2100 default:
2101 break;
2102 case ISD::SDIV:
2103 case ISD::UDIV:
2104 case ISD::SREM:
2105 case ISD::UREM:
2106 case ISD::SDIVREM:
2107 case ISD::UDIVREM:
2108 return true;
2109 }
2110 }
2111
2112 // Assume all other non-float operations are supported.
2113 if (!VT.isFloatingPoint())
2114 return false;
2115
2116 // We'll need a library call to handle most floats when using soft.
2117 if (TLI->useSoftFloat()) {
2118 switch (I.getOpcode()) {
2119 default:
2120 return true;
2121 case Instruction::Alloca:
2122 case Instruction::Load:
2123 case Instruction::Store:
2124 case Instruction::Select:
2125 case Instruction::PHI:
2126 return false;
2127 }
2128 }
2129
2130 // We'll need a libcall to perform double precision operations on a single
2131 // precision only FPU.
2132 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2133 return true;
2134
2135 // Likewise for half precision arithmetic.
2136 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2137 return true;
2138
2139 return false;
2140 }
2141
2142 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2143 AssumptionCache &AC,
2144 TargetLibraryInfo *LibInfo,
2145 HardwareLoopInfo &HWLoopInfo) {
2146 // Low-overhead branches are only supported in the 'low-overhead branch'
2147 // extension of v8.1-m.
2148 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2149 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2150 return false;
2151 }
2152
2153 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2154 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2155 return false;
2156 }
2157
2158 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2159 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2160 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2161 return false;
2162 }
2163
2164 const SCEV *TripCountSCEV =
2165 SE.getAddExpr(BackedgeTakenCount,
2166 SE.getOne(BackedgeTakenCount->getType()));
2167
2168 // We need to store the trip count in LR, a 32-bit register.
2169 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2170 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2171 return false;
2172 }
2173
2174 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2175 // point in generating a hardware loop if that's going to happen.
2176
2177 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2178 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2179 switch (Call->getIntrinsicID()) {
2180 default:
2181 break;
2182 case Intrinsic::start_loop_iterations:
2183 case Intrinsic::test_start_loop_iterations:
2184 case Intrinsic::loop_decrement:
2185 case Intrinsic::loop_decrement_reg:
2186 return true;
2187 }
2188 }
2189 return false;
2190 };
2191
2192 // Scan the instructions to see if there's any that we know will turn into a
2193 // call or if this loop is already a low-overhead loop or will become a tail
2194 // predicated loop.
2195 bool IsTailPredLoop = false;
2196 auto ScanLoop = [&](Loop *L) {
2197 for (auto *BB : L->getBlocks()) {
2198 for (auto &I : *BB) {
2199 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2200 isa<InlineAsm>(I)) {
2201 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2202 return false;
2203 }
2204 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2205 IsTailPredLoop |=
2206 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2207 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2208 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2209 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2210 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2211 }
2212 }
2213 return true;
2214 };
2215
2216 // Visit inner loops.
2217 for (auto *Inner : *L)
2218 if (!ScanLoop(Inner))
2219 return false;
2220
2221 if (!ScanLoop(L))
2222 return false;
2223
2224 // TODO: Check whether the trip count calculation is expensive. If L is the
2225 // inner loop but we know it has a low trip count, calculating that trip
2226 // count (in the parent loop) may be detrimental.
2227
2228 LLVMContext &C = L->getHeader()->getContext();
2229 HWLoopInfo.CounterInReg = true;
2230 HWLoopInfo.IsNestingLegal = false;
2231 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2232 HWLoopInfo.CountType = Type::getInt32Ty(C);
2233 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2234 return true;
2235 }
2236
2237 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2238 // We don't allow icmp's, and because we only look at single block loops,
2239 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2240 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2241 return false;
2242 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2243 // not currently canonical, but soon will be. Code without them uses icmp, and
2244 // so is not tail predicated as per the condition above. In order to get the
2245 // same performance we treat min and max the same as an icmp for tailpred
2246 // purposes for the moment (we often rely on non-tailpred and higher VFs to
2247 // pick more optimal instructions like VQDMULH. They need to be recognized
2248 // directly by the vectorizer).
2249 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2250 if ((II->getIntrinsicID() == Intrinsic::smin ||
2251 II->getIntrinsicID() == Intrinsic::smax ||
2252 II->getIntrinsicID() == Intrinsic::umin ||
2253 II->getIntrinsicID() == Intrinsic::umax) &&
2254 ++ICmpCount > 1)
2255 return false;
2256
2257 if (isa<FCmpInst>(&I))
2258 return false;
2259
2260 // We could allow extending/narrowing FP loads/stores, but codegen is
2261 // too inefficient so reject this for now.
2262 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2263 return false;
2264
2265 // Extends have to be extending-loads
2266 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2267 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2268 return false;
2269
2270 // Truncs have to be narrowing-stores
2271 if (isa<TruncInst>(&I) )
2272 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2273 return false;
2274
2275 return true;
2276 }
2277
2278 // To set up a tail-predicated loop, we need to know the total number of
2279 // elements processed by that loop. Thus, we need to determine the element
2280 // size and:
2281 // 1) it should be uniform for all operations in the vector loop, so we
2282 // e.g. don't want any widening/narrowing operations.
2283 // 2) it should be smaller than i64s because we don't have vector operations
2284 // that work on i64s.
2285 // 3) we don't want elements to be reversed or shuffled, to make sure the
2286 // tail-predication masks/predicates the right lanes.
2287 //
2288 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2289 const DataLayout &DL,
2290 const LoopAccessInfo *LAI) {
2291 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2292
2293 // If there are live-out values, it is probably a reduction. We can predicate
2294 // most reduction operations freely under MVE using a combination of
2295 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2296 // floating point and integer reductions, but don't check for operators
2297 // specifically here. If the value ends up not being a reduction (and so the
2298 // vectorizer cannot tailfold the loop), we should fall back to standard
2299 // vectorization automatically.
2300 SmallVector< Instruction *, 8 > LiveOuts;
2301 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2302 bool ReductionsDisabled =
2303 EnableTailPredication == TailPredication::EnabledNoReductions ||
2304 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2305
2306 for (auto *I : LiveOuts) {
2307 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2308 !I->getType()->isHalfTy()) {
2309 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2310 "live-out value\n");
2311 return false;
2312 }
2313 if (ReductionsDisabled) {
2314 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2315 return false;
2316 }
2317 }
2318
2319 // Next, check that all instructions can be tail-predicated.
2320 PredicatedScalarEvolution PSE = LAI->getPSE();
2321 SmallVector<Instruction *, 16> LoadStores;
2322 int ICmpCount = 0;
2323
2324 for (BasicBlock *BB : L->blocks()) {
2325 for (Instruction &I : BB->instructionsWithoutDebug()) {
2326 if (isa<PHINode>(&I))
2327 continue;
2328 if (!canTailPredicateInstruction(I, ICmpCount)) {
2329 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2330 return false;
2331 }
2332
2333 Type *T = I.getType();
2334 if (T->getScalarSizeInBits() > 32) {
2335 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2336 return false;
2337 }
2338 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2339 Value *Ptr = getLoadStorePointerOperand(&I);
2340 Type *AccessTy = getLoadStoreType(&I);
2341 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2342 if (NextStride == 1) {
2343 // TODO: for now only allow consecutive strides of 1. We could support
2344 // other strides as long as it is uniform, but let's keep it simple
2345 // for now.
2346 continue;
2347 } else if (NextStride == -1 ||
2348 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2349 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2350 LLVM_DEBUG(dbgs()
2351 << "Consecutive strides of -1, 2 or 4 found, vld2/vst2 "
2352 "accesses can't be tail-predicated.\n");
2353 return false;
2354 // TODO: don't tail predicate if there is a reversed load?
2355 } else if (EnableMaskedGatherScatters) {
2356 // Gather/scatters do allow loading from arbitrary strides, at
2357 // least if they are loop invariant.
2358 // TODO: Loop variant strides should in theory work, too, but
2359 // this requires further testing.
2360 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2361 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2362 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2363 if (PSE.getSE()->isLoopInvariant(Step, L))
2364 continue;
2365 }
2366 }
2367 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2368 "tail-predicate.\n");
2369 return false;
2370 }
2371 }
2372 }
2373
2374 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2375 return true;
2376 }
2377
2378 bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
2379 if (!EnableTailPredication) {
2380 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2381 return false;
2382 }
2383
2384 // Creating a predicated vector loop is the first step for generating a
2385 // tail-predicated hardware loop, for which we need the MVE masked
2386 // load/stores instructions:
2387 if (!ST->hasMVEIntegerOps())
2388 return false;
2389
2390 LoopVectorizationLegality *LVL = TFI->LVL;
2391 Loop *L = LVL->getLoop();
2392
2393 // For now, restrict this to single block loops.
2394 if (L->getNumBlocks() > 1) {
2395 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2396 "loop.\n");
2397 return false;
2398 }
2399
2400 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2401
2402 LoopInfo *LI = LVL->getLoopInfo();
2403 HardwareLoopInfo HWLoopInfo(L);
2404 if (!HWLoopInfo.canAnalyze(*LI)) {
2405 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2406 "analyzable.\n");
2407 return false;
2408 }
2409
2410 AssumptionCache *AC = LVL->getAssumptionCache();
2411 ScalarEvolution *SE = LVL->getScalarEvolution();
2412
2413 // This checks if we have the low-overhead branch architecture
2414 // extension, and if we will create a hardware-loop:
2415 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2416 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2417 "profitable.\n");
2418 return false;
2419 }
2420
2421 DominatorTree *DT = LVL->getDominatorTree();
2422 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2423 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2424 "a candidate.\n");
2425 return false;
2426 }
2427
2428 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2429 }
2430
2431 TailFoldingStyle
2432 ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2433 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2434 return TailFoldingStyle::DataWithoutLaneMask;
2435
2436 // Intrinsic @llvm.get.active.lane.mask is supported.
2437 // It is used in the MVETailPredication pass, which requires the number of
2438 // elements processed by this vector loop to setup the tail-predicated
2439 // loop.
2440 return TailFoldingStyle::Data;
2441 }
2442 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2443 TTI::UnrollingPreferences &UP,
2444 OptimizationRemarkEmitter *ORE) {
2445 // Enable upper-bound unrolling universally, provided that we do not see an
2446 // active lane mask, which will be better kept as a loop to become tail
2447 // predicated than to be conditionally unrolled.
2448 UP.UpperBound =
2449 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2450 return isa<IntrinsicInst>(I) &&
2451 cast<IntrinsicInst>(I).getIntrinsicID() ==
2452 Intrinsic::get_active_lane_mask;
2453 });
2454
2455 // Only currently enable these preferences for M-Class cores.
2456 if (!ST->isMClass())
2457 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2458
2459 // Disable loop unrolling for Oz and Os.
2460 UP.OptSizeThreshold = 0;
2461 UP.PartialOptSizeThreshold = 0;
2462 if (L->getHeader()->getParent()->hasOptSize())
2463 return;
2464
2465 SmallVector<BasicBlock*, 4> ExitingBlocks;
2466 L->getExitingBlocks(ExitingBlocks);
2467 LLVM_DEBUG(dbgs() << "Loop has:\n"
2468 << "Blocks: " << L->getNumBlocks() << "\n"
2469 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2470
2471 // Allow at most one exit other than the latch. This acts as an early exit
2472 // as it mirrors the profitability calculation of the runtime unroller.
2473 if (ExitingBlocks.size() > 2)
2474 return;
2475
2476 // Limit the CFG of the loop body for targets with a branch predictor.
2477 // Allowing 4 blocks permits if-then-else diamonds in the body.
2478 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2479 return;
2480
2481 // Don't unroll vectorized loops, including the remainder loop
2482 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2483 return;
2484
2485 // Scan the loop: don't unroll loops with calls as this could prevent
2486 // inlining.
2487 InstructionCost Cost = 0;
2488 for (auto *BB : L->getBlocks()) {
2489 for (auto &I : *BB) {
2490 // Don't unroll vectorised loops. MVE does not benefit from unrolling as
2491 // much as scalar code does.
2492 if (I.getType()->isVectorTy())
2493 return;
2494
2495 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2496 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2497 if (!isLoweredToCall(F))
2498 continue;
2499 }
2500 return;
2501 }
2502
2503 SmallVector<const Value*, 4> Operands(I.operand_values());
2504 Cost += getInstructionCost(&I, Operands,
2505 TargetTransformInfo::TCK_SizeAndLatency);
2506 }
2507 }
2508
2509 // On v6m cores, there are very few registers available. We can easily end up
2510 // spilling and reloading more registers in an unrolled loop. Look at the
2511 // number of LCSSA phis as a rough measure of how many registers will need to
2512 // be live out of the loop, reducing the default unroll count if more than 1
2513 // value is needed. In the long run, all of this should be being learnt by a
2514 // machine.
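// e.g. with the default count of 4, two live-out values halve the count to 2
// and three or more disable runtime unrolling here entirely.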
2515 unsigned UnrollCount = 4;
2516 if (ST->isThumb1Only()) {
2517 unsigned ExitingValues = 0;
2518 SmallVector<BasicBlock *, 4> ExitBlocks;
2519 L->getExitBlocks(ExitBlocks);
2520 for (auto *Exit : ExitBlocks) {
2521 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2522 // only the last is expected to be needed for address operands.
2523 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2524 return PH.getNumOperands() != 1 ||
2525 !isa<GetElementPtrInst>(PH.getOperand(0));
2526 });
2527 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2528 }
2529 if (ExitingValues)
2530 UnrollCount /= ExitingValues;
2531 if (UnrollCount <= 1)
2532 return;
2533 }
2534
2535 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2536 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2537
2538 UP.Partial = true;
2539 UP.Runtime = true;
2540 UP.UnrollRemainder = true;
2541 UP.DefaultUnrollRuntimeCount = UnrollCount;
2542 UP.UnrollAndJam = true;
2543 UP.UnrollAndJamInnerLoopThreshold = 60;
2544
2545 // Forcing the unrolling of small loops can be very useful because of the
2546 // branch-taken cost of the backedge.
2547 if (Cost < 12)
2548 UP.Force = true;
2549 }
2550
2551 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2552 TTI::PeelingPreferences &PP) {
2553 BaseT::getPeelingPreferences(L, SE, PP);
2554 }
2555
2556 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2557 TTI::ReductionFlags Flags) const {
2558 if (!ST->hasMVEIntegerOps())
2559 return false;
2560
2561 unsigned ScalarBits = Ty->getScalarSizeInBits();
2562 switch (Opcode) {
2563 case Instruction::Add:
2564 return ScalarBits <= 64;
2565 default:
2566 return false;
2567 }
2568 }
2569
2570 bool ARMTTIImpl::preferPredicatedReductionSelect(
2571 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2572 if (!ST->hasMVEIntegerOps())
2573 return false;
2574 return true;
2575 }
2576
2577 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2578 StackOffset BaseOffset,
2579 bool HasBaseReg, int64_t Scale,
2580 unsigned AddrSpace) const {
2581 TargetLoweringBase::AddrMode AM;
2582 AM.BaseGV = BaseGV;
2583 AM.BaseOffs = BaseOffset.getFixed();
2584 AM.HasBaseReg = HasBaseReg;
2585 AM.Scale = Scale;
2586 AM.ScalableOffset = BaseOffset.getScalable();
2587 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2588 if (ST->hasFPAO())
2589 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2590 return 0;
2591 }
2592 return -1;
2593 }
2594
2595 bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2596 if (Thumb) {
2597 // B.W is available in any Thumb2-supporting target, and also in every
2598 // version of Armv8-M, even Baseline which does not include the rest of
2599 // Thumb2.
2600 return ST->isThumb2() || ST->hasV8MBaselineOps();
2601 } else {
2602 // B is available in all versions of the Arm ISA, so the only question is
2603 // whether that ISA is available at all.
2604 return ST->hasARMOps();
2605 }
2606 }
2607