1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "ARMTargetTransformInfo.h" 10 #include "ARMSubtarget.h" 11 #include "MCTargetDesc/ARMAddressingModes.h" 12 #include "llvm/ADT/APInt.h" 13 #include "llvm/ADT/SmallVector.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/CodeGen/CostTable.h" 16 #include "llvm/CodeGen/ISDOpcodes.h" 17 #include "llvm/CodeGen/ValueTypes.h" 18 #include "llvm/CodeGenTypes/MachineValueType.h" 19 #include "llvm/IR/BasicBlock.h" 20 #include "llvm/IR/DataLayout.h" 21 #include "llvm/IR/DerivedTypes.h" 22 #include "llvm/IR/Instruction.h" 23 #include "llvm/IR/Instructions.h" 24 #include "llvm/IR/IntrinsicInst.h" 25 #include "llvm/IR/Intrinsics.h" 26 #include "llvm/IR/IntrinsicsARM.h" 27 #include "llvm/IR/PatternMatch.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/Support/Casting.h" 30 #include "llvm/Support/KnownBits.h" 31 #include "llvm/Target/TargetMachine.h" 32 #include "llvm/TargetParser/SubtargetFeature.h" 33 #include "llvm/Transforms/InstCombine/InstCombiner.h" 34 #include "llvm/Transforms/Utils/Local.h" 35 #include "llvm/Transforms/Utils/LoopUtils.h" 36 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 37 #include <algorithm> 38 #include <cassert> 39 #include <cstdint> 40 #include <optional> 41 #include <utility> 42 43 using namespace llvm; 44 45 #define DEBUG_TYPE "armtti" 46 47 static cl::opt<bool> EnableMaskedLoadStores( 48 "enable-arm-maskedldst", cl::Hidden, cl::init(true), 49 cl::desc("Enable the generation of masked loads and stores")); 50 51 static cl::opt<bool> DisableLowOverheadLoops( 52 "disable-arm-loloops", cl::Hidden, cl::init(false), 53 cl::desc("Disable the generation of low-overhead loops")); 54 55 static cl::opt<bool> 56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), 57 cl::desc("Enable the generation of WLS loops")); 58 59 extern cl::opt<TailPredication::Mode> EnableTailPredication; 60 61 extern cl::opt<bool> EnableMaskedGatherScatters; 62 63 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; 64 65 /// Convert a vector load intrinsic into a simple llvm load instruction. 66 /// This is beneficial when the underlying object being addressed comes 67 /// from a constant, since we get constant-folding for free. 68 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, 69 InstCombiner::BuilderTy &Builder) { 70 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); 71 72 if (!IntrAlign) 73 return nullptr; 74 75 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign 76 ? 
MemAlign 77 : IntrAlign->getLimitedValue(); 78 79 if (!isPowerOf2_32(Alignment)) 80 return nullptr; 81 82 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), 83 PointerType::get(II.getType(), 0)); 84 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); 85 } 86 87 bool ARMTTIImpl::areInlineCompatible(const Function *Caller, 88 const Function *Callee) const { 89 const TargetMachine &TM = getTLI()->getTargetMachine(); 90 const FeatureBitset &CallerBits = 91 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 92 const FeatureBitset &CalleeBits = 93 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 94 95 // To inline a callee, all features not in the allowed list must match exactly. 96 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) == 97 (CalleeBits & ~InlineFeaturesAllowed); 98 // For features in the allowed list, the callee's features must be a subset of 99 // the callers'. 100 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == 101 (CalleeBits & InlineFeaturesAllowed); 102 return MatchExact && MatchSubset; 103 } 104 105 TTI::AddressingModeKind 106 ARMTTIImpl::getPreferredAddressingMode(const Loop *L, 107 ScalarEvolution *SE) const { 108 if (ST->hasMVEIntegerOps()) 109 return TTI::AMK_PostIndexed; 110 111 if (L->getHeader()->getParent()->hasOptSize()) 112 return TTI::AMK_None; 113 114 if (ST->isMClass() && ST->isThumb2() && 115 L->getNumBlocks() == 1) 116 return TTI::AMK_PreIndexed; 117 118 return TTI::AMK_None; 119 } 120 121 std::optional<Instruction *> 122 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 123 using namespace PatternMatch; 124 Intrinsic::ID IID = II.getIntrinsicID(); 125 switch (IID) { 126 default: 127 break; 128 case Intrinsic::arm_neon_vld1: { 129 Align MemAlign = 130 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, 131 &IC.getAssumptionCache(), &IC.getDominatorTree()); 132 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { 133 return IC.replaceInstUsesWith(II, V); 134 } 135 break; 136 } 137 138 case Intrinsic::arm_neon_vld2: 139 case Intrinsic::arm_neon_vld3: 140 case Intrinsic::arm_neon_vld4: 141 case Intrinsic::arm_neon_vld2lane: 142 case Intrinsic::arm_neon_vld3lane: 143 case Intrinsic::arm_neon_vld4lane: 144 case Intrinsic::arm_neon_vst1: 145 case Intrinsic::arm_neon_vst2: 146 case Intrinsic::arm_neon_vst3: 147 case Intrinsic::arm_neon_vst4: 148 case Intrinsic::arm_neon_vst2lane: 149 case Intrinsic::arm_neon_vst3lane: 150 case Intrinsic::arm_neon_vst4lane: { 151 Align MemAlign = 152 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, 153 &IC.getAssumptionCache(), &IC.getDominatorTree()); 154 unsigned AlignArg = II.arg_size() - 1; 155 Value *AlignArgOp = II.getArgOperand(AlignArg); 156 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); 157 if (Align && *Align < MemAlign) { 158 return IC.replaceOperand( 159 II, AlignArg, 160 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), 161 false)); 162 } 163 break; 164 } 165 166 case Intrinsic::arm_mve_pred_i2v: { 167 Value *Arg = II.getArgOperand(0); 168 Value *ArgArg; 169 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( 170 PatternMatch::m_Value(ArgArg))) && 171 II.getType() == ArgArg->getType()) { 172 return IC.replaceInstUsesWith(II, ArgArg); 173 } 174 Constant *XorMask; 175 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( 176 PatternMatch::m_Value(ArgArg)), 177 PatternMatch::m_Constant(XorMask))) && 178 
II.getType() == ArgArg->getType()) { 179 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { 180 if (CI->getValue().trunc(16).isAllOnes()) { 181 auto TrueVector = IC.Builder.CreateVectorSplat( 182 cast<FixedVectorType>(II.getType())->getNumElements(), 183 IC.Builder.getTrue()); 184 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); 185 } 186 } 187 } 188 KnownBits ScalarKnown(32); 189 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), 190 ScalarKnown)) { 191 return &II; 192 } 193 break; 194 } 195 case Intrinsic::arm_mve_pred_v2i: { 196 Value *Arg = II.getArgOperand(0); 197 Value *ArgArg; 198 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( 199 PatternMatch::m_Value(ArgArg)))) { 200 return IC.replaceInstUsesWith(II, ArgArg); 201 } 202 203 if (II.getMetadata(LLVMContext::MD_range)) 204 break; 205 206 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000)); 207 208 if (auto CurrentRange = II.getRange()) { 209 Range = Range.intersectWith(*CurrentRange); 210 if (Range == CurrentRange) 211 break; 212 } 213 214 II.addRangeRetAttr(Range); 215 II.addRetAttr(Attribute::NoUndef); 216 return &II; 217 } 218 case Intrinsic::arm_mve_vadc: 219 case Intrinsic::arm_mve_vadc_predicated: { 220 unsigned CarryOp = 221 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; 222 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && 223 "Bad type for intrinsic!"); 224 225 KnownBits CarryKnown(32); 226 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), 227 CarryKnown)) { 228 return &II; 229 } 230 break; 231 } 232 case Intrinsic::arm_mve_vmldava: { 233 Instruction *I = cast<Instruction>(&II); 234 if (I->hasOneUse()) { 235 auto *User = cast<Instruction>(*I->user_begin()); 236 Value *OpZ; 237 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && 238 match(I->getOperand(3), m_Zero())) { 239 Value *OpX = I->getOperand(4); 240 Value *OpY = I->getOperand(5); 241 Type *OpTy = OpX->getType(); 242 243 IC.Builder.SetInsertPoint(User); 244 Value *V = 245 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, 246 {I->getOperand(0), I->getOperand(1), 247 I->getOperand(2), OpZ, OpX, OpY}); 248 249 IC.replaceInstUsesWith(*User, V); 250 return IC.eraseInstFromFunction(*User); 251 } 252 } 253 return std::nullopt; 254 } 255 } 256 return std::nullopt; 257 } 258 259 std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic( 260 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 261 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 262 std::function<void(Instruction *, unsigned, APInt, APInt &)> 263 SimplifyAndSetOp) const { 264 265 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the 266 // opcode specifying a Top/Bottom instruction, which can change between 267 // instructions. 268 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) { 269 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements(); 270 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue(); 271 272 // The only odd/even lanes of operand 0 will only be demanded depending 273 // on whether this is a top/bottom instruction. 274 APInt DemandedElts = 275 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) 276 : APInt::getHighBitsSet(2, 1)); 277 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts); 278 // The other lanes will be defined from the inserted elements. 279 UndefElts &= APInt::getSplat(NumElts, IsTop ? 
APInt::getLowBitsSet(2, 1) 280 : APInt::getHighBitsSet(2, 1)); 281 return std::nullopt; 282 }; 283 284 switch (II.getIntrinsicID()) { 285 default: 286 break; 287 case Intrinsic::arm_mve_vcvt_narrow: 288 SimplifyNarrowInstrTopBottom(2); 289 break; 290 case Intrinsic::arm_mve_vqmovn: 291 SimplifyNarrowInstrTopBottom(4); 292 break; 293 case Intrinsic::arm_mve_vshrn: 294 SimplifyNarrowInstrTopBottom(7); 295 break; 296 } 297 298 return std::nullopt; 299 } 300 301 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 302 TTI::TargetCostKind CostKind) { 303 assert(Ty->isIntegerTy()); 304 305 unsigned Bits = Ty->getPrimitiveSizeInBits(); 306 if (Bits == 0 || Imm.getActiveBits() >= 64) 307 return 4; 308 309 int64_t SImmVal = Imm.getSExtValue(); 310 uint64_t ZImmVal = Imm.getZExtValue(); 311 if (!ST->isThumb()) { 312 if ((SImmVal >= 0 && SImmVal < 65536) || 313 (ARM_AM::getSOImmVal(ZImmVal) != -1) || 314 (ARM_AM::getSOImmVal(~ZImmVal) != -1)) 315 return 1; 316 return ST->hasV6T2Ops() ? 2 : 3; 317 } 318 if (ST->isThumb2()) { 319 if ((SImmVal >= 0 && SImmVal < 65536) || 320 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || 321 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) 322 return 1; 323 return ST->hasV6T2Ops() ? 2 : 3; 324 } 325 // Thumb1, any i8 imm cost 1. 326 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256)) 327 return 1; 328 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) 329 return 2; 330 // Load from constantpool. 331 return 3; 332 } 333 334 // Constants smaller than 256 fit in the immediate field of 335 // Thumb1 instructions so we return a zero cost and 1 otherwise. 336 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, 337 const APInt &Imm, Type *Ty) { 338 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256) 339 return 0; 340 341 return 1; 342 } 343 344 // Checks whether Inst is part of a min(max()) or max(min()) pattern 345 // that will match to an SSAT instruction. Returns the instruction being 346 // saturated, or null if no saturation pattern was found. 347 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { 348 Value *LHS, *RHS; 349 ConstantInt *C; 350 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; 351 352 if (InstSPF == SPF_SMAX && 353 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && 354 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) { 355 356 auto isSSatMin = [&](Value *MinInst) { 357 if (isa<SelectInst>(MinInst)) { 358 Value *MinLHS, *MinRHS; 359 ConstantInt *MinC; 360 SelectPatternFlavor MinSPF = 361 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; 362 if (MinSPF == SPF_SMIN && 363 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && 364 MinC->getValue() == ((-Imm) - 1)) 365 return true; 366 } 367 return false; 368 }; 369 370 if (isSSatMin(Inst->getOperand(1))) 371 return cast<Instruction>(Inst->getOperand(1))->getOperand(1); 372 if (Inst->hasNUses(2) && 373 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin())))) 374 return Inst->getOperand(1); 375 } 376 return nullptr; 377 } 378 379 // Look for a FP Saturation pattern, where the instruction can be simplified to 380 // a fptosi.sat. max(min(fptosi)). The constant in this case is always free. 
381 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) { 382 if (Imm.getBitWidth() != 64 || 383 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648 384 return false; 385 Value *FP = isSSATMinMaxPattern(Inst, Imm); 386 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse()) 387 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm); 388 if (!FP) 389 return false; 390 return isa<FPToSIInst>(FP); 391 } 392 393 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 394 const APInt &Imm, Type *Ty, 395 TTI::TargetCostKind CostKind, 396 Instruction *Inst) { 397 // Division by a constant can be turned into multiplication, but only if we 398 // know it's constant. So it's not so much that the immediate is cheap (it's 399 // not), but that the alternative is worse. 400 // FIXME: this is probably unneeded with GlobalISel. 401 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv || 402 Opcode == Instruction::SRem || Opcode == Instruction::URem) && 403 Idx == 1) 404 return 0; 405 406 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at 407 // splitting any large offsets. 408 if (Opcode == Instruction::GetElementPtr && Idx != 0) 409 return 0; 410 411 if (Opcode == Instruction::And) { 412 // UXTB/UXTH 413 if (Imm == 255 || Imm == 65535) 414 return 0; 415 // Conversion to BIC is free, and means we can use ~Imm instead. 416 return std::min(getIntImmCost(Imm, Ty, CostKind), 417 getIntImmCost(~Imm, Ty, CostKind)); 418 } 419 420 if (Opcode == Instruction::Add) 421 // Conversion to SUB is free, and means we can use -Imm instead. 422 return std::min(getIntImmCost(Imm, Ty, CostKind), 423 getIntImmCost(-Imm, Ty, CostKind)); 424 425 if (Opcode == Instruction::ICmp && Imm.isNegative() && 426 Ty->getIntegerBitWidth() == 32) { 427 int64_t NegImm = -Imm.getSExtValue(); 428 if (ST->isThumb2() && NegImm < 1<<12) 429 // icmp X, #-C -> cmn X, #C 430 return 0; 431 if (ST->isThumb() && NegImm < 1<<8) 432 // icmp X, #-C -> adds X, #C 433 return 0; 434 } 435 436 // xor a, -1 can always be folded to MVN 437 if (Opcode == Instruction::Xor && Imm.isAllOnes()) 438 return 0; 439 440 // Ensures negative constant of min(max()) or max(min()) patterns that 441 // match to SSAT instructions don't get hoisted 442 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && 443 Ty->getIntegerBitWidth() <= 32) { 444 if (isSSATMinMaxPattern(Inst, Imm) || 445 (isa<ICmpInst>(Inst) && Inst->hasOneUse() && 446 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) 447 return 0; 448 } 449 450 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm)) 451 return 0; 452 453 // We can convert <= -1 to < 0, which is generally quite cheap. 454 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) { 455 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate(); 456 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) 457 return std::min(getIntImmCost(Imm, Ty, CostKind), 458 getIntImmCost(Imm + 1, Ty, CostKind)); 459 } 460 461 return getIntImmCost(Imm, Ty, CostKind); 462 } 463 464 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode, 465 TTI::TargetCostKind CostKind, 466 const Instruction *I) { 467 if (CostKind == TTI::TCK_RecipThroughput && 468 (ST->hasNEON() || ST->hasMVEIntegerOps())) { 469 // FIXME: The vectorizer is highly sensistive to the cost of these 470 // instructions, which suggests that it may be using the costs incorrectly. 
471 // But, for now, just make them free to avoid performance regressions for 472 // vector targets. 473 return 0; 474 } 475 return BaseT::getCFInstrCost(Opcode, CostKind, I); 476 } 477 478 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 479 Type *Src, 480 TTI::CastContextHint CCH, 481 TTI::TargetCostKind CostKind, 482 const Instruction *I) { 483 int ISD = TLI->InstructionOpcodeToISD(Opcode); 484 assert(ISD && "Invalid opcode"); 485 486 // TODO: Allow non-throughput costs that aren't binary. 487 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 488 if (CostKind != TTI::TCK_RecipThroughput) 489 return Cost == 0 ? 0 : 1; 490 return Cost; 491 }; 492 auto IsLegalFPType = [this](EVT VT) { 493 EVT EltVT = VT.getScalarType(); 494 return (EltVT == MVT::f32 && ST->hasVFP2Base()) || 495 (EltVT == MVT::f64 && ST->hasFP64()) || 496 (EltVT == MVT::f16 && ST->hasFullFP16()); 497 }; 498 499 EVT SrcTy = TLI->getValueType(DL, Src); 500 EVT DstTy = TLI->getValueType(DL, Dst); 501 502 if (!SrcTy.isSimple() || !DstTy.isSimple()) 503 return AdjustCost( 504 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 505 506 // Extending masked load/Truncating masked stores is expensive because we 507 // currently don't split them. This means that we'll likely end up 508 // loading/storing each element individually (hence the high cost). 509 if ((ST->hasMVEIntegerOps() && 510 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || 511 Opcode == Instruction::SExt)) || 512 (ST->hasMVEFloatOps() && 513 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && 514 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) 515 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) 516 return 2 * DstTy.getVectorNumElements() * 517 ST->getMVEVectorCostFactor(CostKind); 518 519 // The extend of other kinds of load is free 520 if (CCH == TTI::CastContextHint::Normal || 521 CCH == TTI::CastContextHint::Masked) { 522 static const TypeConversionCostTblEntry LoadConversionTbl[] = { 523 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, 524 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, 525 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, 526 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, 527 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, 528 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, 529 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, 530 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, 531 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, 532 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, 533 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, 534 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, 535 }; 536 if (const auto *Entry = ConvertCostTableLookup( 537 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 538 return AdjustCost(Entry->Cost); 539 540 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { 541 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, 542 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, 543 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, 544 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, 545 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, 546 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, 547 // The following extend from a legal type to an illegal type, so need to 548 // split the load. This introduced an extra load operation, but the 549 // extend is still "free". 
550 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1}, 551 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1}, 552 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3}, 553 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3}, 554 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1}, 555 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1}, 556 }; 557 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 558 if (const auto *Entry = 559 ConvertCostTableLookup(MVELoadConversionTbl, ISD, 560 DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 561 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 562 } 563 564 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { 565 // FPExtends are similar but also require the VCVT instructions. 566 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, 567 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, 568 }; 569 if (SrcTy.isVector() && ST->hasMVEFloatOps()) { 570 if (const auto *Entry = 571 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, 572 DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 573 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 574 } 575 576 // The truncate of a store is free. This is the mirror of extends above. 577 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { 578 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, 579 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, 580 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, 581 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, 582 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, 583 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, 584 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, 585 }; 586 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 587 if (const auto *Entry = 588 ConvertCostTableLookup(MVEStoreConversionTbl, ISD, 589 SrcTy.getSimpleVT(), DstTy.getSimpleVT())) 590 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 591 } 592 593 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { 594 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, 595 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, 596 }; 597 if (SrcTy.isVector() && ST->hasMVEFloatOps()) { 598 if (const auto *Entry = 599 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, 600 SrcTy.getSimpleVT(), DstTy.getSimpleVT())) 601 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 602 } 603 } 604 605 // NEON vector operations that can extend their inputs. 606 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) && 607 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) { 608 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = { 609 // vaddl 610 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 }, 611 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 }, 612 // vsubl 613 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 }, 614 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 }, 615 // vmull 616 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 }, 617 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 }, 618 // vshll 619 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 }, 620 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 }, 621 }; 622 623 auto *User = cast<Instruction>(*I->user_begin()); 624 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode()); 625 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD, 626 DstTy.getSimpleVT(), 627 SrcTy.getSimpleVT())) { 628 return AdjustCost(Entry->Cost); 629 } 630 } 631 632 // Single to/from double precision conversions. 
633 if (Src->isVectorTy() && ST->hasNEON() && 634 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 && 635 DstTy.getScalarType() == MVT::f32) || 636 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 && 637 DstTy.getScalarType() == MVT::f64))) { 638 static const CostTblEntry NEONFltDblTbl[] = { 639 // Vector fptrunc/fpext conversions. 640 {ISD::FP_ROUND, MVT::v2f64, 2}, 641 {ISD::FP_EXTEND, MVT::v2f32, 2}, 642 {ISD::FP_EXTEND, MVT::v4f32, 4}}; 643 644 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 645 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) 646 return AdjustCost(LT.first * Entry->Cost); 647 } 648 649 // Some arithmetic, load and store operations have specific instructions 650 // to cast up/down their types automatically at no extra cost. 651 // TODO: Get these tables to know at least what the related operations are. 652 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { 653 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 654 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 655 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, 656 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, 657 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 658 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 659 660 // The number of vmovl instructions for the extension. 661 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 662 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 663 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 664 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 665 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, 666 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, 667 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 668 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 669 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 670 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 671 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 672 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 673 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 674 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 675 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 676 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 677 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 678 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 679 680 // Operations that we legalize using splitting. 681 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 682 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 683 684 // Vector float <-> i32 conversions. 
685 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 686 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 687 688 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 689 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 690 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 691 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 692 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 693 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 694 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 695 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 696 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 697 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 698 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 699 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 700 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 701 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 702 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 703 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 704 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 705 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 706 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 707 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 708 709 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 710 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 711 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, 712 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, 713 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 714 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 715 716 // Vector double <-> i32 conversions. 717 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 718 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 719 720 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 721 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 722 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 723 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 724 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 725 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 726 727 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 728 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 729 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, 730 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, 731 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, 732 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } 733 }; 734 735 if (SrcTy.isVector() && ST->hasNEON()) { 736 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, 737 DstTy.getSimpleVT(), 738 SrcTy.getSimpleVT())) 739 return AdjustCost(Entry->Cost); 740 } 741 742 // Scalar float to integer conversions. 
743 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { 744 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, 745 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, 746 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, 747 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, 748 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, 749 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, 750 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, 751 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, 752 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, 753 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, 754 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, 755 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, 756 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, 757 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, 758 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, 759 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, 760 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, 761 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, 762 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, 763 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } 764 }; 765 if (SrcTy.isFloatingPoint() && ST->hasNEON()) { 766 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, 767 DstTy.getSimpleVT(), 768 SrcTy.getSimpleVT())) 769 return AdjustCost(Entry->Cost); 770 } 771 772 // Scalar integer to float conversions. 773 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { 774 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, 775 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, 776 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, 777 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, 778 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, 779 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, 780 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, 781 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, 782 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, 783 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, 784 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, 785 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, 786 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, 787 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, 788 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, 789 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, 790 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, 791 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, 792 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, 793 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } 794 }; 795 796 if (SrcTy.isInteger() && ST->hasNEON()) { 797 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, 798 ISD, DstTy.getSimpleVT(), 799 SrcTy.getSimpleVT())) 800 return AdjustCost(Entry->Cost); 801 } 802 803 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one 804 // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext 805 // are linearised so take more. 
806 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { 807 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 808 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 809 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 810 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 811 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, 812 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, 813 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 814 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 815 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, 816 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 817 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, 818 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, 819 }; 820 821 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 822 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, 823 ISD, DstTy.getSimpleVT(), 824 SrcTy.getSimpleVT())) 825 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 826 } 827 828 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) { 829 // As general rule, fp converts that were not matched above are scalarized 830 // and cost 1 vcvt for each lane, so long as the instruction is available. 831 // If not it will become a series of function calls. 832 const InstructionCost CallCost = 833 getCallInstrCost(nullptr, Dst, {Src}, CostKind); 834 int Lanes = 1; 835 if (SrcTy.isFixedLengthVector()) 836 Lanes = SrcTy.getVectorNumElements(); 837 838 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) 839 return Lanes; 840 else 841 return Lanes * CallCost; 842 } 843 844 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && 845 SrcTy.isFixedLengthVector()) { 846 // Treat a truncate with larger than legal source (128bits for MVE) as 847 // expensive, 2 instructions per lane. 848 if ((SrcTy.getScalarType() == MVT::i8 || 849 SrcTy.getScalarType() == MVT::i16 || 850 SrcTy.getScalarType() == MVT::i32) && 851 SrcTy.getSizeInBits() > 128 && 852 SrcTy.getSizeInBits() > DstTy.getSizeInBits()) 853 return SrcTy.getVectorNumElements() * 2; 854 } 855 856 // Scalar integer conversion costs. 857 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { 858 // i16 -> i64 requires two dependent operations. 859 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, 860 861 // Truncates on i64 are assumed to be free. 862 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 }, 863 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 }, 864 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 }, 865 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 } 866 }; 867 868 if (SrcTy.isInteger()) { 869 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, 870 DstTy.getSimpleVT(), 871 SrcTy.getSimpleVT())) 872 return AdjustCost(Entry->Cost); 873 } 874 875 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 876 ? ST->getMVEVectorCostFactor(CostKind) 877 : 1; 878 return AdjustCost( 879 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 880 } 881 882 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, 883 TTI::TargetCostKind CostKind, 884 unsigned Index, Value *Op0, 885 Value *Op1) { 886 // Penalize inserting into an D-subregister. We end up with a three times 887 // lower estimated throughput on swift. 
888 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && 889 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) 890 return 3; 891 892 if (ST->hasNEON() && (Opcode == Instruction::InsertElement || 893 Opcode == Instruction::ExtractElement)) { 894 // Cross-class copies are expensive on many microarchitectures, 895 // so assume they are expensive by default. 896 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy()) 897 return 3; 898 899 // Even if it's not a cross class copy, this likely leads to mixing 900 // of NEON and VFP code and should be therefore penalized. 901 if (ValTy->isVectorTy() && 902 ValTy->getScalarSizeInBits() <= 32) 903 return std::max<InstructionCost>( 904 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1), 905 2U); 906 } 907 908 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || 909 Opcode == Instruction::ExtractElement)) { 910 // Integer cross-lane moves are more expensive than float, which can 911 // sometimes just be vmovs. Integer involve being passes to GPR registers, 912 // causing more of a delay. 913 std::pair<InstructionCost, MVT> LT = 914 getTypeLegalizationCost(ValTy->getScalarType()); 915 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1); 916 } 917 918 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1); 919 } 920 921 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 922 Type *CondTy, 923 CmpInst::Predicate VecPred, 924 TTI::TargetCostKind CostKind, 925 const Instruction *I) { 926 int ISD = TLI->InstructionOpcodeToISD(Opcode); 927 928 // Thumb scalar code size cost for select. 929 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && 930 ST->isThumb() && !ValTy->isVectorTy()) { 931 // Assume expensive structs. 932 if (TLI->getValueType(DL, ValTy, true) == MVT::Other) 933 return TTI::TCC_Expensive; 934 935 // Select costs can vary because they: 936 // - may require one or more conditional mov (including an IT), 937 // - can't operate directly on immediates, 938 // - require live flags, which we can't copy around easily. 939 InstructionCost Cost = getTypeLegalizationCost(ValTy).first; 940 941 // Possible IT instruction for Thumb2, or more for Thumb1. 942 ++Cost; 943 944 // i1 values may need rematerialising by using mov immediates and/or 945 // flag setting instructions. 946 if (ValTy->isIntegerTy(1)) 947 ++Cost; 948 949 return Cost; 950 } 951 952 // If this is a vector min/max/abs, use the cost of that intrinsic directly 953 // instead. Hopefully when min/max intrinsics are more prevalent this code 954 // will not be needed. 
955 const Instruction *Sel = I; 956 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel && 957 Sel->hasOneUse()) 958 Sel = cast<Instruction>(Sel->user_back()); 959 if (Sel && ValTy->isVectorTy() && 960 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) { 961 const Value *LHS, *RHS; 962 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor; 963 unsigned IID = 0; 964 switch (SPF) { 965 case SPF_ABS: 966 IID = Intrinsic::abs; 967 break; 968 case SPF_SMIN: 969 IID = Intrinsic::smin; 970 break; 971 case SPF_SMAX: 972 IID = Intrinsic::smax; 973 break; 974 case SPF_UMIN: 975 IID = Intrinsic::umin; 976 break; 977 case SPF_UMAX: 978 IID = Intrinsic::umax; 979 break; 980 case SPF_FMINNUM: 981 IID = Intrinsic::minnum; 982 break; 983 case SPF_FMAXNUM: 984 IID = Intrinsic::maxnum; 985 break; 986 default: 987 break; 988 } 989 if (IID) { 990 // The ICmp is free, the select gets the cost of the min/max/etc 991 if (Sel != I) 992 return 0; 993 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy}); 994 return getIntrinsicInstrCost(CostAttrs, CostKind); 995 } 996 } 997 998 // On NEON a vector select gets lowered to vbsl. 999 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) { 1000 // Lowering of some vector selects is currently far from perfect. 1001 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { 1002 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, 1003 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, 1004 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } 1005 }; 1006 1007 EVT SelCondTy = TLI->getValueType(DL, CondTy); 1008 EVT SelValTy = TLI->getValueType(DL, ValTy); 1009 if (SelCondTy.isSimple() && SelValTy.isSimple()) { 1010 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, 1011 SelCondTy.getSimpleVT(), 1012 SelValTy.getSimpleVT())) 1013 return Entry->Cost; 1014 } 1015 1016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1017 return LT.first; 1018 } 1019 1020 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() && 1021 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && 1022 cast<FixedVectorType>(ValTy)->getNumElements() > 1) { 1023 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy); 1024 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy); 1025 if (!VecCondTy) 1026 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy)); 1027 1028 // If we don't have mve.fp any fp operations will need to be scalarized. 1029 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) { 1030 // One scalaization insert, one scalarization extract and the cost of the 1031 // fcmps. 1032 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false, 1033 /*Extract*/ true, CostKind) + 1034 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true, 1035 /*Extract*/ false, CostKind) + 1036 VecValTy->getNumElements() * 1037 getCmpSelInstrCost(Opcode, ValTy->getScalarType(), 1038 VecCondTy->getScalarType(), VecPred, 1039 CostKind, I); 1040 } 1041 1042 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1043 int BaseCost = ST->getMVEVectorCostFactor(CostKind); 1044 // There are two types - the input that specifies the type of the compare 1045 // and the output vXi1 type. Because we don't know how the output will be 1046 // split, we may need an expensive shuffle to get two in sync. This has the 1047 // effect of making larger than legal compares (v8i32 for example) 1048 // expensive. 
1049 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) { 1050 if (LT.first > 1) 1051 return LT.first * BaseCost + 1052 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true, 1053 /*Extract*/ false, CostKind); 1054 return BaseCost; 1055 } 1056 } 1057 1058 // Default to cheap (throughput/size of 1 instruction) but adjust throughput 1059 // for "multiple beats" potentially needed by MVE instructions. 1060 int BaseCost = 1; 1061 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy()) 1062 BaseCost = ST->getMVEVectorCostFactor(CostKind); 1063 1064 return BaseCost * 1065 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 1066 } 1067 1068 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, 1069 ScalarEvolution *SE, 1070 const SCEV *Ptr) { 1071 // Address computations in vectorized code with non-consecutive addresses will 1072 // likely result in more instructions compared to scalar code where the 1073 // computation can more often be merged into the index mode. The resulting 1074 // extra micro-ops can significantly decrease throughput. 1075 unsigned NumVectorInstToHideOverhead = 10; 1076 int MaxMergeDistance = 64; 1077 1078 if (ST->hasNEON()) { 1079 if (Ty->isVectorTy() && SE && 1080 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 1081 return NumVectorInstToHideOverhead; 1082 1083 // In many cases the address computation is not merged into the instruction 1084 // addressing mode. 1085 return 1; 1086 } 1087 return BaseT::getAddressComputationCost(Ty, SE, Ptr); 1088 } 1089 1090 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { 1091 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 1092 // If a VCTP is part of a chain, it's already profitable and shouldn't be 1093 // optimized, else LSR may block tail-predication. 1094 switch (II->getIntrinsicID()) { 1095 case Intrinsic::arm_mve_vctp8: 1096 case Intrinsic::arm_mve_vctp16: 1097 case Intrinsic::arm_mve_vctp32: 1098 case Intrinsic::arm_mve_vctp64: 1099 return true; 1100 default: 1101 break; 1102 } 1103 } 1104 return false; 1105 } 1106 1107 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 1108 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) 1109 return false; 1110 1111 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { 1112 // Don't support v2i1 yet. 1113 if (VecTy->getNumElements() == 2) 1114 return false; 1115 1116 // We don't support extending fp types. 1117 unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); 1118 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) 1119 return false; 1120 } 1121 1122 unsigned EltWidth = DataTy->getScalarSizeInBits(); 1123 return (EltWidth == 32 && Alignment >= 4) || 1124 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); 1125 } 1126 1127 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { 1128 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) 1129 return false; 1130 1131 unsigned EltWidth = Ty->getScalarSizeInBits(); 1132 return ((EltWidth == 32 && Alignment >= 4) || 1133 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); 1134 } 1135 1136 /// Given a memcpy/memset/memmove instruction, return the number of memory 1137 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a 1138 /// call is used. 
1139 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { 1140 MemOp MOp; 1141 unsigned DstAddrSpace = ~0u; 1142 unsigned SrcAddrSpace = ~0u; 1143 const Function *F = I->getParent()->getParent(); 1144 1145 if (const auto *MC = dyn_cast<MemTransferInst>(I)) { 1146 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); 1147 // If 'size' is not a constant, a library call will be generated. 1148 if (!C) 1149 return -1; 1150 1151 const unsigned Size = C->getValue().getZExtValue(); 1152 const Align DstAlign = *MC->getDestAlign(); 1153 const Align SrcAlign = *MC->getSourceAlign(); 1154 1155 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, 1156 /*IsVolatile*/ false); 1157 DstAddrSpace = MC->getDestAddressSpace(); 1158 SrcAddrSpace = MC->getSourceAddressSpace(); 1159 } 1160 else if (const auto *MS = dyn_cast<MemSetInst>(I)) { 1161 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); 1162 // If 'size' is not a constant, a library call will be generated. 1163 if (!C) 1164 return -1; 1165 1166 const unsigned Size = C->getValue().getZExtValue(); 1167 const Align DstAlign = *MS->getDestAlign(); 1168 1169 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, 1170 /*IsZeroMemset*/ false, /*IsVolatile*/ false); 1171 DstAddrSpace = MS->getDestAddressSpace(); 1172 } 1173 else 1174 llvm_unreachable("Expected a memcpy/move or memset!"); 1175 1176 unsigned Limit, Factor = 2; 1177 switch(I->getIntrinsicID()) { 1178 case Intrinsic::memcpy: 1179 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); 1180 break; 1181 case Intrinsic::memmove: 1182 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); 1183 break; 1184 case Intrinsic::memset: 1185 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); 1186 Factor = 1; 1187 break; 1188 default: 1189 llvm_unreachable("Expected a memcpy/move or memset!"); 1190 } 1191 1192 // MemOps will be poplulated with a list of data types that needs to be 1193 // loaded and stored. That's why we multiply the number of elements by 2 to 1194 // get the cost for this memcpy. 1195 std::vector<EVT> MemOps; 1196 if (getTLI()->findOptimalMemOpLowering( 1197 MemOps, Limit, MOp, DstAddrSpace, 1198 SrcAddrSpace, F->getAttributes())) 1199 return MemOps.size() * Factor; 1200 1201 // If we can't find an optimal memop lowering, return the default cost 1202 return -1; 1203 } 1204 1205 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { 1206 int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); 1207 1208 // To model the cost of a library call, we assume 1 for the call, and 1209 // 3 for the argument setup. 1210 if (NumOps == -1) 1211 return 4; 1212 return NumOps; 1213 } 1214 1215 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 1216 VectorType *Tp, ArrayRef<int> Mask, 1217 TTI::TargetCostKind CostKind, 1218 int Index, VectorType *SubTp, 1219 ArrayRef<const Value *> Args, 1220 const Instruction *CxtI) { 1221 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); 1222 // Treat extractsubvector as single op permutation. 1223 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; 1224 if (IsExtractSubvector) 1225 Kind = TTI::SK_PermuteSingleSrc; 1226 if (ST->hasNEON()) { 1227 if (Kind == TTI::SK_Broadcast) { 1228 static const CostTblEntry NEONDupTbl[] = { 1229 // VDUP handles these cases. 
1230 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 1231 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 1232 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 1233 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 1234 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, 1235 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, 1236 1237 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, 1238 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, 1239 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, 1240 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; 1241 1242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 1243 if (const auto *Entry = 1244 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) 1245 return LT.first * Entry->Cost; 1246 } 1247 if (Kind == TTI::SK_Reverse) { 1248 static const CostTblEntry NEONShuffleTbl[] = { 1249 // Reverse shuffle cost one instruction if we are shuffling within a 1250 // double word (vrev) or two if we shuffle a quad word (vrev, vext). 1251 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 1252 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 1253 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 1254 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 1255 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, 1256 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, 1257 1258 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, 1259 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, 1260 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, 1261 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; 1262 1263 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 1264 if (const auto *Entry = 1265 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) 1266 return LT.first * Entry->Cost; 1267 } 1268 if (Kind == TTI::SK_Select) { 1269 static const CostTblEntry NEONSelShuffleTbl[] = { 1270 // Select shuffle cost table for ARM. Cost is the number of 1271 // instructions 1272 // required to create the shuffled vector. 1273 1274 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 1275 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 1276 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 1277 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 1278 1279 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, 1280 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, 1281 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, 1282 1283 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, 1284 1285 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; 1286 1287 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 1288 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, 1289 ISD::VECTOR_SHUFFLE, LT.second)) 1290 return LT.first * Entry->Cost; 1291 } 1292 } 1293 if (ST->hasMVEIntegerOps()) { 1294 if (Kind == TTI::SK_Broadcast) { 1295 static const CostTblEntry MVEDupTbl[] = { 1296 // VDUP handles these cases. 1297 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, 1298 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, 1299 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, 1300 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, 1301 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; 1302 1303 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 1304 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, 1305 LT.second)) 1306 return LT.first * Entry->Cost * 1307 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput); 1308 } 1309 1310 if (!Mask.empty()) { 1311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 1312 if (LT.second.isVector() && 1313 Mask.size() <= LT.second.getVectorNumElements() && 1314 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || 1315 isVREVMask(Mask, LT.second, 64))) 1316 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first; 1317 } 1318 } 1319 1320 // Restore optimal kind. 
1321 if (IsExtractSubvector) 1322 Kind = TTI::SK_ExtractSubvector; 1323 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() 1324 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) 1325 : 1; 1326 return BaseCost * 1327 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); 1328 } 1329 1330 InstructionCost ARMTTIImpl::getArithmeticInstrCost( 1331 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1332 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 1333 ArrayRef<const Value *> Args, 1334 const Instruction *CxtI) { 1335 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); 1336 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { 1337 // Make operations on i1 relatively expensive as this often involves 1338 // combining predicates. AND and XOR should be easier to handle with IT 1339 // blocks. 1340 switch (ISDOpcode) { 1341 default: 1342 break; 1343 case ISD::AND: 1344 case ISD::XOR: 1345 return 2; 1346 case ISD::OR: 1347 return 3; 1348 } 1349 } 1350 1351 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 1352 1353 if (ST->hasNEON()) { 1354 const unsigned FunctionCallDivCost = 20; 1355 const unsigned ReciprocalDivCost = 10; 1356 static const CostTblEntry CostTbl[] = { 1357 // Division. 1358 // These costs are somewhat random. Choose a cost of 20 to indicate that 1359 // vectorizing devision (added function call) is going to be very expensive. 1360 // Double registers types. 1361 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, 1362 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, 1363 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, 1364 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, 1365 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, 1366 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, 1367 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, 1368 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, 1369 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, 1370 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, 1371 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, 1372 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, 1373 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, 1374 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, 1375 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, 1376 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, 1377 // Quad register types. 1378 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, 1379 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, 1380 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, 1381 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, 1382 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, 1383 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, 1384 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, 1385 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, 1386 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, 1387 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, 1388 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, 1389 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, 1390 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, 1391 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, 1392 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, 1393 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, 1394 // Multiplication. 
1395 }; 1396 1397 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) 1398 return LT.first * Entry->Cost; 1399 1400 InstructionCost Cost = BaseT::getArithmeticInstrCost( 1401 Opcode, Ty, CostKind, Op1Info, Op2Info); 1402 1403 // This is somewhat of a hack. The problem that we are facing is that SROA 1404 // creates a sequence of shift, and, or instructions to construct values. 1405 // These sequences are recognized by the ISel and have zero-cost. Not so for 1406 // the vectorized code. Because we have support for v2i64 but not i64 those 1407 // sequences look particularly beneficial to vectorize. 1408 // To work around this we increase the cost of v2i64 operations to make them 1409 // seem less beneficial. 1410 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant()) 1411 Cost += 4; 1412 1413 return Cost; 1414 } 1415 1416 // If this operation is a shift on arm/thumb2, it might well be folded into 1417 // the following instruction, hence having a cost of 0. 1418 auto LooksLikeAFreeShift = [&]() { 1419 if (ST->isThumb1Only() || Ty->isVectorTy()) 1420 return false; 1421 1422 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift()) 1423 return false; 1424 if (!Op2Info.isUniform() || !Op2Info.isConstant()) 1425 return false; 1426 1427 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB 1428 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) { 1429 case Instruction::Add: 1430 case Instruction::Sub: 1431 case Instruction::And: 1432 case Instruction::Xor: 1433 case Instruction::Or: 1434 case Instruction::ICmp: 1435 return true; 1436 default: 1437 return false; 1438 } 1439 }; 1440 if (LooksLikeAFreeShift()) 1441 return 0; 1442 1443 // Default to cheap (throughput/size of 1 instruction) but adjust throughput 1444 // for "multiple beats" potentially needed by MVE instructions. 1445 int BaseCost = 1; 1446 if (ST->hasMVEIntegerOps() && Ty->isVectorTy()) 1447 BaseCost = ST->getMVEVectorCostFactor(CostKind); 1448 1449 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, 1450 // without treating floats as more expensive that scalars or increasing the 1451 // costs for custom operations. The results is also multiplied by the 1452 // MVEVectorCostFactor where appropriate. 1453 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second)) 1454 return LT.first * BaseCost; 1455 1456 // Else this is expand, assume that we need to scalarize this op. 1457 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 1458 unsigned Num = VTy->getNumElements(); 1459 InstructionCost Cost = 1460 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); 1461 // Return the cost of multiple scalar invocation plus the cost of 1462 // inserting and extracting the values. 1463 SmallVector<Type *> Tys(Args.size(), Ty); 1464 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) + 1465 Num * Cost; 1466 } 1467 1468 return BaseCost; 1469 } 1470 1471 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 1472 MaybeAlign Alignment, 1473 unsigned AddressSpace, 1474 TTI::TargetCostKind CostKind, 1475 TTI::OperandValueInfo OpInfo, 1476 const Instruction *I) { 1477 // TODO: Handle other cost kinds. 
1478 if (CostKind != TTI::TCK_RecipThroughput) 1479 return 1; 1480 1481 // Type legalization can't handle structs 1482 if (TLI->getValueType(DL, Src, true) == MVT::Other) 1483 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1484 CostKind); 1485 1486 if (ST->hasNEON() && Src->isVectorTy() && 1487 (Alignment && *Alignment != Align(16)) && 1488 cast<VectorType>(Src)->getElementType()->isDoubleTy()) { 1489 // Unaligned loads/stores are extremely inefficient. 1490 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. 1491 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 1492 return LT.first * 4; 1493 } 1494 1495 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. 1496 // Same for stores. 1497 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && 1498 ((Opcode == Instruction::Load && I->hasOneUse() && 1499 isa<FPExtInst>(*I->user_begin())) || 1500 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { 1501 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); 1502 Type *DstTy = 1503 Opcode == Instruction::Load 1504 ? (*I->user_begin())->getType() 1505 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); 1506 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && 1507 DstTy->getScalarType()->isFloatTy()) 1508 return ST->getMVEVectorCostFactor(CostKind); 1509 } 1510 1511 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 1512 ? ST->getMVEVectorCostFactor(CostKind) 1513 : 1; 1514 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1515 CostKind, OpInfo, I); 1516 } 1517 1518 InstructionCost 1519 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, 1520 unsigned AddressSpace, 1521 TTI::TargetCostKind CostKind) { 1522 if (ST->hasMVEIntegerOps()) { 1523 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) 1524 return ST->getMVEVectorCostFactor(CostKind); 1525 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) 1526 return ST->getMVEVectorCostFactor(CostKind); 1527 } 1528 if (!isa<FixedVectorType>(Src)) 1529 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1530 CostKind); 1531 // Scalar cost, which is currently very high due to the efficiency of the 1532 // generated code. 1533 return cast<FixedVectorType>(Src)->getNumElements() * 8; 1534 } 1535 1536 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost( 1537 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1538 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1539 bool UseMaskForCond, bool UseMaskForGaps) { 1540 assert(Factor >= 2 && "Invalid interleave factor"); 1541 assert(isa<VectorType>(VecTy) && "Expect a vector type"); 1542 1543 // vldN/vstN doesn't support vector types of i64/f64 element. 1544 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; 1545 1546 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && 1547 !UseMaskForCond && !UseMaskForGaps) { 1548 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); 1549 auto *SubVecTy = 1550 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 1551 1552 // vldN/vstN only support legal vector types of size 64 or 128 in bits. 1553 // Accesses having vector types that are a multiple of 128 bits can be 1554 // matched to more than one vldN/vstN instruction. 1555 int BaseCost = 1556 ST->hasMVEIntegerOps() ? 
ST->getMVEVectorCostFactor(CostKind) : 1; 1557 if (NumElts % Factor == 0 && 1558 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) 1559 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL); 1560 1561 // Some smaller than legal interleaved patterns are cheap as we can make 1562 // use of the vmovn or vrev patterns to interleave a standard load. This is 1563 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is 1564 // promoted differently). The cost of 2 here is then a load and vrev or 1565 // vmovn. 1566 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && 1567 VecTy->isIntOrIntVectorTy() && 1568 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64) 1569 return 2 * BaseCost; 1570 } 1571 1572 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 1573 Alignment, AddressSpace, CostKind, 1574 UseMaskForCond, UseMaskForGaps); 1575 } 1576 1577 InstructionCost ARMTTIImpl::getGatherScatterOpCost( 1578 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 1579 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 1580 using namespace PatternMatch; 1581 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters) 1582 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1583 Alignment, CostKind, I); 1584 1585 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!"); 1586 auto *VTy = cast<FixedVectorType>(DataTy); 1587 1588 // TODO: Splitting, once we do that. 1589 1590 unsigned NumElems = VTy->getNumElements(); 1591 unsigned EltSize = VTy->getScalarSizeInBits(); 1592 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy); 1593 1594 // For now, it is assumed that for the MVE gather instructions the loads are 1595 // all effectively serialised. This means the cost is the scalar cost 1596 // multiplied by the number of elements being loaded. This is possibly very 1597 // conservative, but even so we still end up vectorising loops because the 1598 // cost per iteration for many loops is lower than for scalar loops. 1599 InstructionCost VectorCost = 1600 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind); 1601 // The scalarization cost should be a lot higher. We use the number of vector 1602 // elements plus the scalarization overhead. If masking is required then a lot 1603 // of little blocks will be needed and potentially a scalarized p0 mask, 1604 // greatly increasing the cost. 1605 InstructionCost ScalarCost = 1606 NumElems * LT.first + (VariableMask ? 
NumElems * 5 : 0) + 1607 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false, 1608 CostKind) + 1609 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true, 1610 CostKind); 1611 1612 if (EltSize < 8 || Alignment < EltSize / 8) 1613 return ScalarCost; 1614 1615 unsigned ExtSize = EltSize; 1616 // Check whether there's a single user that asks for an extended type 1617 if (I != nullptr) { 1618 // Dependent of the caller of this function, a gather instruction will 1619 // either have opcode Instruction::Load or be a call to the masked_gather 1620 // intrinsic 1621 if ((I->getOpcode() == Instruction::Load || 1622 match(I, m_Intrinsic<Intrinsic::masked_gather>())) && 1623 I->hasOneUse()) { 1624 const User *Us = *I->users().begin(); 1625 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) { 1626 // only allow valid type combinations 1627 unsigned TypeSize = 1628 cast<Instruction>(Us)->getType()->getScalarSizeInBits(); 1629 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) || 1630 (TypeSize == 16 && EltSize == 8)) && 1631 TypeSize * NumElems == 128) { 1632 ExtSize = TypeSize; 1633 } 1634 } 1635 } 1636 // Check whether the input data needs to be truncated 1637 TruncInst *T; 1638 if ((I->getOpcode() == Instruction::Store || 1639 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) && 1640 (T = dyn_cast<TruncInst>(I->getOperand(0)))) { 1641 // Only allow valid type combinations 1642 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits(); 1643 if (((EltSize == 16 && TypeSize == 32) || 1644 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) && 1645 TypeSize * NumElems == 128) 1646 ExtSize = TypeSize; 1647 } 1648 } 1649 1650 if (ExtSize * NumElems != 128 || NumElems < 4) 1651 return ScalarCost; 1652 1653 // Any (aligned) i32 gather will not need to be scalarised. 1654 if (ExtSize == 32) 1655 return VectorCost; 1656 // For smaller types, we need to ensure that the gep's inputs are correctly 1657 // extended from a small enough value. Other sizes (including i64) are 1658 // scalarized for now. 1659 if (ExtSize != 8 && ExtSize != 16) 1660 return ScalarCost; 1661 1662 if (const auto *BC = dyn_cast<BitCastInst>(Ptr)) 1663 Ptr = BC->getOperand(0); 1664 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { 1665 if (GEP->getNumOperands() != 2) 1666 return ScalarCost; 1667 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType()); 1668 // Scale needs to be correct (which is only relevant for i16s). 1669 if (Scale != 1 && Scale * 8 != ExtSize) 1670 return ScalarCost; 1671 // And we need to zext (not sext) the indexes from a small enough type. 1672 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) { 1673 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize) 1674 return VectorCost; 1675 } 1676 return ScalarCost; 1677 } 1678 return ScalarCost; 1679 } 1680 1681 InstructionCost 1682 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 1683 std::optional<FastMathFlags> FMF, 1684 TTI::TargetCostKind CostKind) { 1685 1686 EVT ValVT = TLI->getValueType(DL, ValTy); 1687 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1688 unsigned EltSize = ValVT.getScalarSizeInBits(); 1689 1690 // In general floating point reductions are a series of elementwise 1691 // operations, with free extracts on each step. These are either in-order or 1692 // treewise depending on whether that is allowed by the fast math flags. 
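// As an illustrative example of the logic below: a reassociable fadd
// reduction of <8 x float> on an MVE target is costed as one v4f32 fadd
// (halving 256 bits down to a single 128-bit vector) plus 4 scalar fadds for
// the remaining lanes, with the extracts themselves treated as free.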
1693 if ((ISD == ISD::FADD || ISD == ISD::FMUL) && 1694 ((EltSize == 32 && ST->hasVFP2Base()) || 1695 (EltSize == 64 && ST->hasFP64()) || 1696 (EltSize == 16 && ST->hasFullFP16()))) { 1697 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements(); 1698 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1); 1699 InstructionCost VecCost = 0; 1700 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) && 1701 NumElts * EltSize > VecLimit) { 1702 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2); 1703 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind); 1704 NumElts /= 2; 1705 } 1706 1707 // For fp16 we need to extract the upper lane elements. MVE can add a 1708 // VREV+FMIN/MAX to perform another vector step instead. 1709 InstructionCost ExtractCost = 0; 1710 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() && 1711 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) { 1712 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2; 1713 NumElts /= 2; 1714 } else if (ValVT.getVectorElementType() == MVT::f16) 1715 ExtractCost = NumElts / 2; 1716 1717 return VecCost + ExtractCost + 1718 NumElts * 1719 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind); 1720 } 1721 1722 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) && 1723 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) { 1724 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements(); 1725 unsigned VecLimit = 1726 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1); 1727 InstructionCost VecCost = 0; 1728 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) { 1729 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2); 1730 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind); 1731 NumElts /= 2; 1732 } 1733 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector 1734 // step. 1735 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 && 1736 NumElts * EltSize == 64) { 1737 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts); 1738 VecCost += ST->getMVEVectorCostFactor(CostKind) + 1739 getArithmeticInstrCost(Opcode, VecTy, CostKind); 1740 NumElts /= 2; 1741 } 1742 1743 // From here we extract the elements and perform the and/or/xor. 
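// For example, an and-reduction of <16 x i8> on a NEON-only target is costed
// below as one v8i8 'and' (folding 128 bits down to 64), followed by 8 lane
// extracts and 7 scalar 'and's to combine the remaining elements.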
1744 InstructionCost ExtractCost = NumElts; 1745 return VecCost + ExtractCost + 1746 (NumElts - 1) * getArithmeticInstrCost( 1747 Opcode, ValTy->getElementType(), CostKind); 1748 } 1749 1750 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD || 1751 TTI::requiresOrderedReduction(FMF)) 1752 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1753 1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1755 1756 static const CostTblEntry CostTblAdd[]{ 1757 {ISD::ADD, MVT::v16i8, 1}, 1758 {ISD::ADD, MVT::v8i16, 1}, 1759 {ISD::ADD, MVT::v4i32, 1}, 1760 }; 1761 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) 1762 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; 1763 1764 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1765 } 1766 1767 InstructionCost ARMTTIImpl::getExtendedReductionCost( 1768 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, 1769 FastMathFlags FMF, TTI::TargetCostKind CostKind) { 1770 EVT ValVT = TLI->getValueType(DL, ValTy); 1771 EVT ResVT = TLI->getValueType(DL, ResTy); 1772 1773 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1774 1775 switch (ISD) { 1776 case ISD::ADD: 1777 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { 1778 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1779 1780 // The legal cases are: 1781 // VADDV u/s 8/16/32 1782 // VADDLV u/s 32 1783 // Codegen currently cannot always handle larger than legal vectors very 1784 // well, especially for predicated reductions where the mask needs to be 1785 // split, so restrict to 128bit or smaller input types. 1786 unsigned RevVTSize = ResVT.getSizeInBits(); 1787 if (ValVT.getSizeInBits() <= 128 && 1788 ((LT.second == MVT::v16i8 && RevVTSize <= 32) || 1789 (LT.second == MVT::v8i16 && RevVTSize <= 32) || 1790 (LT.second == MVT::v4i32 && RevVTSize <= 64))) 1791 return ST->getMVEVectorCostFactor(CostKind) * LT.first; 1792 } 1793 break; 1794 default: 1795 break; 1796 } 1797 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF, 1798 CostKind); 1799 } 1800 1801 InstructionCost 1802 ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, 1803 VectorType *ValTy, 1804 TTI::TargetCostKind CostKind) { 1805 EVT ValVT = TLI->getValueType(DL, ValTy); 1806 EVT ResVT = TLI->getValueType(DL, ResTy); 1807 1808 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { 1809 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1810 1811 // The legal cases are: 1812 // VMLAV u/s 8/16/32 1813 // VMLALV u/s 16/32 1814 // Codegen currently cannot always handle larger than legal vectors very 1815 // well, especially for predicated reductions where the mask needs to be 1816 // split, so restrict to 128bit or smaller input types. 
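// For example (assuming MVE integer ops are present), a v16i8 multiply
// feeding an i32 add reduction can use VMLAV.u8/VMLAV.s8, and a v4i32
// multiply accumulating into an i64 result can use VMLALV.u32/VMLALV.s32, so
// both fall into the single MVE-instruction cost returned below.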
1817 unsigned RevVTSize = ResVT.getSizeInBits(); 1818 if (ValVT.getSizeInBits() <= 128 && 1819 ((LT.second == MVT::v16i8 && RevVTSize <= 32) || 1820 (LT.second == MVT::v8i16 && RevVTSize <= 64) || 1821 (LT.second == MVT::v4i32 && RevVTSize <= 64))) 1822 return ST->getMVEVectorCostFactor(CostKind) * LT.first; 1823 } 1824 1825 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind); 1826 } 1827 1828 InstructionCost 1829 ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 1830 FastMathFlags FMF, 1831 TTI::TargetCostKind CostKind) { 1832 EVT ValVT = TLI->getValueType(DL, Ty); 1833 1834 // In general floating point reductions are a series of elementwise 1835 // operations, with free extracts on each step. These are either in-order or 1836 // treewise depending on whether that is allowed by the fast math flags. 1837 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) && 1838 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) || 1839 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) || 1840 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) { 1841 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); 1842 unsigned EltSize = ValVT.getScalarSizeInBits(); 1843 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1); 1844 InstructionCost VecCost; 1845 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) { 1846 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2); 1847 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF); 1848 VecCost += getIntrinsicInstrCost(ICA, CostKind); 1849 NumElts /= 2; 1850 } 1851 1852 // For fp16 we need to extract the upper lane elements. MVE can add a 1853 // VREV+FMIN/MAX to perform another vector step instead. 1854 InstructionCost ExtractCost = 0; 1855 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 && 1856 NumElts == 8) { 1857 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2; 1858 NumElts /= 2; 1859 } else if (ValVT.getVectorElementType() == MVT::f16) 1860 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2; 1861 1862 IntrinsicCostAttributes ICA(IID, Ty->getElementType(), 1863 {Ty->getElementType(), Ty->getElementType()}, 1864 FMF); 1865 return VecCost + ExtractCost + 1866 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind); 1867 } 1868 1869 if (IID == Intrinsic::smin || IID == Intrinsic::smax || 1870 IID == Intrinsic::umin || IID == Intrinsic::umax) { 1871 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 1872 1873 // All costs are the same for u/s min/max. These lower to vminv, which are 1874 // given a slightly higher cost as they tend to take multiple cycles for 1875 // smaller type sizes. 1876 static const CostTblEntry CostTblAdd[]{ 1877 {ISD::SMIN, MVT::v16i8, 4}, 1878 {ISD::SMIN, MVT::v8i16, 3}, 1879 {ISD::SMIN, MVT::v4i32, 2}, 1880 }; 1881 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second)) 1882 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; 1883 } 1884 1885 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 1886 } 1887 1888 InstructionCost 1889 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1890 TTI::TargetCostKind CostKind) { 1891 switch (ICA.getID()) { 1892 case Intrinsic::get_active_lane_mask: 1893 // Currently we make a somewhat optimistic assumption that 1894 // active_lane_mask's are always free. 
In reality it may be freely folded
1895 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1896 // of add/icmp code. We may need to improve this in the future, but being
1897 // able to detect if it is free or not involves looking at a lot of other
1898 // code. We currently assume that the vectorizer inserted these, and knew
1899 // what it was doing in adding one.
1900 if (ST->hasMVEIntegerOps())
1901 return 0;
1902 break;
1903 case Intrinsic::sadd_sat:
1904 case Intrinsic::ssub_sat:
1905 case Intrinsic::uadd_sat:
1906 case Intrinsic::usub_sat: {
1907 if (!ST->hasMVEIntegerOps())
1908 break;
1909 Type *VT = ICA.getReturnType();
1910
1911 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1912 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1913 LT.second == MVT::v16i8) {
1914 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1915 // need to extend the type, as it uses shr(qadd(shl, shl)).
1916 unsigned Instrs =
1917 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1918 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1919 }
1920 break;
1921 }
1922 case Intrinsic::abs:
1923 case Intrinsic::smin:
1924 case Intrinsic::smax:
1925 case Intrinsic::umin:
1926 case Intrinsic::umax: {
1927 if (!ST->hasMVEIntegerOps())
1928 break;
1929 Type *VT = ICA.getReturnType();
1930
1931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1932 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1933 LT.second == MVT::v16i8)
1934 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1935 break;
1936 }
1937 case Intrinsic::minnum:
1938 case Intrinsic::maxnum: {
1939 if (!ST->hasMVEFloatOps())
1940 break;
1941 Type *VT = ICA.getReturnType();
1942 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1943 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1944 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1945 break;
1946 }
1947 case Intrinsic::fptosi_sat:
1948 case Intrinsic::fptoui_sat: {
1949 if (ICA.getArgTypes().empty())
1950 break;
1951 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1952 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1953 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1954 // Check for the legal types, with the correct subtarget features.
1955 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1956 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1957 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1958 return LT.first;
1959
1960 // Equally for MVE vector types
1961 if (ST->hasMVEFloatOps() &&
1962 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1963 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1964 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1965
1966 // Otherwise we use a legal convert followed by a min+max
1967 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1968 (ST->hasFP64() && LT.second == MVT::f64) ||
1969 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1970 (ST->hasMVEFloatOps() &&
1971 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1972 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1973 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1974 LT.second.getScalarSizeInBits());
1975 InstructionCost Cost =
1976 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1977 IntrinsicCostAttributes Attrs1(IsSigned ?
Intrinsic::smin 1978 : Intrinsic::umin, 1979 LegalTy, {LegalTy, LegalTy}); 1980 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 1981 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax 1982 : Intrinsic::umax, 1983 LegalTy, {LegalTy, LegalTy}); 1984 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 1985 return LT.first * Cost; 1986 } 1987 break; 1988 } 1989 } 1990 1991 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1992 } 1993 1994 bool ARMTTIImpl::isLoweredToCall(const Function *F) { 1995 if (!F->isIntrinsic()) 1996 return BaseT::isLoweredToCall(F); 1997 1998 // Assume all Arm-specific intrinsics map to an instruction. 1999 if (F->getName().starts_with("llvm.arm")) 2000 return false; 2001 2002 switch (F->getIntrinsicID()) { 2003 default: break; 2004 case Intrinsic::powi: 2005 case Intrinsic::sin: 2006 case Intrinsic::cos: 2007 case Intrinsic::pow: 2008 case Intrinsic::log: 2009 case Intrinsic::log10: 2010 case Intrinsic::log2: 2011 case Intrinsic::exp: 2012 case Intrinsic::exp2: 2013 return true; 2014 case Intrinsic::sqrt: 2015 case Intrinsic::fabs: 2016 case Intrinsic::copysign: 2017 case Intrinsic::floor: 2018 case Intrinsic::ceil: 2019 case Intrinsic::trunc: 2020 case Intrinsic::rint: 2021 case Intrinsic::nearbyint: 2022 case Intrinsic::round: 2023 case Intrinsic::canonicalize: 2024 case Intrinsic::lround: 2025 case Intrinsic::llround: 2026 case Intrinsic::lrint: 2027 case Intrinsic::llrint: 2028 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64()) 2029 return true; 2030 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16()) 2031 return true; 2032 // Some operations can be handled by vector instructions and assume 2033 // unsupported vectors will be expanded into supported scalar ones. 2034 // TODO Handle scalar operations properly. 2035 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base(); 2036 case Intrinsic::masked_store: 2037 case Intrinsic::masked_load: 2038 case Intrinsic::masked_gather: 2039 case Intrinsic::masked_scatter: 2040 return !ST->hasMVEIntegerOps(); 2041 case Intrinsic::sadd_with_overflow: 2042 case Intrinsic::uadd_with_overflow: 2043 case Intrinsic::ssub_with_overflow: 2044 case Intrinsic::usub_with_overflow: 2045 case Intrinsic::sadd_sat: 2046 case Intrinsic::uadd_sat: 2047 case Intrinsic::ssub_sat: 2048 case Intrinsic::usub_sat: 2049 return false; 2050 } 2051 2052 return BaseT::isLoweredToCall(F); 2053 } 2054 2055 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { 2056 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); 2057 EVT VT = TLI->getValueType(DL, I.getType(), true); 2058 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) 2059 return true; 2060 2061 // Check if an intrinsic will be lowered to a call and assume that any 2062 // other CallInst will generate a bl. 2063 if (auto *Call = dyn_cast<CallInst>(&I)) { 2064 if (auto *II = dyn_cast<IntrinsicInst>(Call)) { 2065 switch(II->getIntrinsicID()) { 2066 case Intrinsic::memcpy: 2067 case Intrinsic::memset: 2068 case Intrinsic::memmove: 2069 return getNumMemOps(II) == -1; 2070 default: 2071 if (const Function *F = Call->getCalledFunction()) 2072 return isLoweredToCall(F); 2073 } 2074 } 2075 return true; 2076 } 2077 2078 // FPv5 provides conversions between integer, double-precision, 2079 // single-precision, and half-precision formats. 
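// For example, an fptosi of a float is a single VCVT when FP-ARMv8 is
// available, so it is not treated as a potential call below; without
// FP-ARMv8 these conversions are conservatively assumed to possibly become
// library calls (e.g. __aeabi_* helpers).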
2080 switch (I.getOpcode()) { 2081 default: 2082 break; 2083 case Instruction::FPToSI: 2084 case Instruction::FPToUI: 2085 case Instruction::SIToFP: 2086 case Instruction::UIToFP: 2087 case Instruction::FPTrunc: 2088 case Instruction::FPExt: 2089 return !ST->hasFPARMv8Base(); 2090 } 2091 2092 // FIXME: Unfortunately the approach of checking the Operation Action does 2093 // not catch all cases of Legalization that use library calls. Our 2094 // Legalization step categorizes some transformations into library calls as 2095 // Custom, Expand or even Legal when doing type legalization. So for now 2096 // we have to special case for instance the SDIV of 64bit integers and the 2097 // use of floating point emulation. 2098 if (VT.isInteger() && VT.getSizeInBits() >= 64) { 2099 switch (ISD) { 2100 default: 2101 break; 2102 case ISD::SDIV: 2103 case ISD::UDIV: 2104 case ISD::SREM: 2105 case ISD::UREM: 2106 case ISD::SDIVREM: 2107 case ISD::UDIVREM: 2108 return true; 2109 } 2110 } 2111 2112 // Assume all other non-float operations are supported. 2113 if (!VT.isFloatingPoint()) 2114 return false; 2115 2116 // We'll need a library call to handle most floats when using soft. 2117 if (TLI->useSoftFloat()) { 2118 switch (I.getOpcode()) { 2119 default: 2120 return true; 2121 case Instruction::Alloca: 2122 case Instruction::Load: 2123 case Instruction::Store: 2124 case Instruction::Select: 2125 case Instruction::PHI: 2126 return false; 2127 } 2128 } 2129 2130 // We'll need a libcall to perform double precision operations on a single 2131 // precision only FPU. 2132 if (I.getType()->isDoubleTy() && !ST->hasFP64()) 2133 return true; 2134 2135 // Likewise for half precision arithmetic. 2136 if (I.getType()->isHalfTy() && !ST->hasFullFP16()) 2137 return true; 2138 2139 return false; 2140 } 2141 2142 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 2143 AssumptionCache &AC, 2144 TargetLibraryInfo *LibInfo, 2145 HardwareLoopInfo &HWLoopInfo) { 2146 // Low-overhead branches are only supported in the 'low-overhead branch' 2147 // extension of v8.1-m. 2148 if (!ST->hasLOB() || DisableLowOverheadLoops) { 2149 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n"); 2150 return false; 2151 } 2152 2153 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 2154 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n"); 2155 return false; 2156 } 2157 2158 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 2159 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { 2160 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n"); 2161 return false; 2162 } 2163 2164 const SCEV *TripCountSCEV = 2165 SE.getAddExpr(BackedgeTakenCount, 2166 SE.getOne(BackedgeTakenCount->getType())); 2167 2168 // We need to store the trip count in LR, a 32-bit register. 2169 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) { 2170 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n"); 2171 return false; 2172 } 2173 2174 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little 2175 // point in generating a hardware loop if that's going to happen. 
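// For example, a 64-bit sdiv or srem in the loop body is expected to become
// a call to an __aeabi_ldivmod-style helper (see maybeLoweredToCall above),
// which would clobber LR, so such loops are rejected by the scan below.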
2176
2177 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2178 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2179 switch (Call->getIntrinsicID()) {
2180 default:
2181 break;
2182 case Intrinsic::start_loop_iterations:
2183 case Intrinsic::test_start_loop_iterations:
2184 case Intrinsic::loop_decrement:
2185 case Intrinsic::loop_decrement_reg:
2186 return true;
2187 }
2188 }
2189 return false;
2190 };
2191
2192 // Scan the instructions to see if there are any that we know will turn into
2193 // a call or if this loop is already a low-overhead loop or will become a
2194 // tail predicated loop.
2195 bool IsTailPredLoop = false;
2196 auto ScanLoop = [&](Loop *L) {
2197 for (auto *BB : L->getBlocks()) {
2198 for (auto &I : *BB) {
2199 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2200 isa<InlineAsm>(I)) {
2201 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2202 return false;
2203 }
2204 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2205 IsTailPredLoop |=
2206 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2207 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2208 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2209 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2210 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2211 }
2212 }
2213 return true;
2214 };
2215
2216 // Visit inner loops.
2217 for (auto *Inner : *L)
2218 if (!ScanLoop(Inner))
2219 return false;
2220
2221 if (!ScanLoop(L))
2222 return false;
2223
2224 // TODO: Check whether the trip count calculation is expensive. If L is the
2225 // inner loop but we know it has a low trip count, calculating that trip
2226 // count (in the parent loop) may be detrimental.
2227
2228 LLVMContext &C = L->getHeader()->getContext();
2229 HWLoopInfo.CounterInReg = true;
2230 HWLoopInfo.IsNestingLegal = false;
2231 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2232 HWLoopInfo.CountType = Type::getInt32Ty(C);
2233 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2234 return true;
2235 }
2236
2237 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2238 // We don't allow icmps, and because we only look at single block loops,
2239 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2240 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2241 return false;
2242 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2243 // not currently canonical, but soon will be. Code without them uses icmp, and
2244 // so is not tail predicated as per the condition above. In order to get the
2245 // same performance we treat min and max the same as an icmp for tailpred
2246 // purposes for the moment (we often rely on non-tailpred and higher VFs to
2247 // pick more optimal instructions like VQDMULH. They need to be recognized
2248 // directly by the vectorizer).
2249 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2250 if ((II->getIntrinsicID() == Intrinsic::smin ||
2251 II->getIntrinsicID() == Intrinsic::smax ||
2252 II->getIntrinsicID() == Intrinsic::umin ||
2253 II->getIntrinsicID() == Intrinsic::umax) &&
2254 ++ICmpCount > 1)
2255 return false;
2256
2257 if (isa<FCmpInst>(&I))
2258 return false;
2259
2260 // We could allow extending/narrowing FP loads/stores, but codegen is
2261 // too inefficient so reject this for now.
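// For example, a loop that loads <8 x half> values and fpexts them to
// <8 x float> for the arithmetic would be rejected here.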
2262 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I)) 2263 return false; 2264 2265 // Extends have to be extending-loads 2266 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) ) 2267 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0))) 2268 return false; 2269 2270 // Truncs have to be narrowing-stores 2271 if (isa<TruncInst>(&I) ) 2272 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin())) 2273 return false; 2274 2275 return true; 2276 } 2277 2278 // To set up a tail-predicated loop, we need to know the total number of 2279 // elements processed by that loop. Thus, we need to determine the element 2280 // size and: 2281 // 1) it should be uniform for all operations in the vector loop, so we 2282 // e.g. don't want any widening/narrowing operations. 2283 // 2) it should be smaller than i64s because we don't have vector operations 2284 // that work on i64s. 2285 // 3) we don't want elements to be reversed or shuffled, to make sure the 2286 // tail-predication masks/predicates the right lanes. 2287 // 2288 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 2289 const DataLayout &DL, 2290 const LoopAccessInfo *LAI) { 2291 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); 2292 2293 // If there are live-out values, it is probably a reduction. We can predicate 2294 // most reduction operations freely under MVE using a combination of 2295 // prefer-predicated-reduction-select and inloop reductions. We limit this to 2296 // floating point and integer reductions, but don't check for operators 2297 // specifically here. If the value ends up not being a reduction (and so the 2298 // vectorizer cannot tailfold the loop), we should fall back to standard 2299 // vectorization automatically. 2300 SmallVector< Instruction *, 8 > LiveOuts; 2301 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); 2302 bool ReductionsDisabled = 2303 EnableTailPredication == TailPredication::EnabledNoReductions || 2304 EnableTailPredication == TailPredication::ForceEnabledNoReductions; 2305 2306 for (auto *I : LiveOuts) { 2307 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && 2308 !I->getType()->isHalfTy()) { 2309 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " 2310 "live-out value\n"); 2311 return false; 2312 } 2313 if (ReductionsDisabled) { 2314 LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); 2315 return false; 2316 } 2317 } 2318 2319 // Next, check that all instructions can be tail-predicated. 2320 PredicatedScalarEvolution PSE = LAI->getPSE(); 2321 SmallVector<Instruction *, 16> LoadStores; 2322 int ICmpCount = 0; 2323 2324 for (BasicBlock *BB : L->blocks()) { 2325 for (Instruction &I : BB->instructionsWithoutDebug()) { 2326 if (isa<PHINode>(&I)) 2327 continue; 2328 if (!canTailPredicateInstruction(I, ICmpCount)) { 2329 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump()); 2330 return false; 2331 } 2332 2333 Type *T = I.getType(); 2334 if (T->getScalarSizeInBits() > 32) { 2335 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); 2336 return false; 2337 } 2338 if (isa<StoreInst>(I) || isa<LoadInst>(I)) { 2339 Value *Ptr = getLoadStorePointerOperand(&I); 2340 Type *AccessTy = getLoadStoreType(&I); 2341 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); 2342 if (NextStride == 1) { 2343 // TODO: for now only allow consecutive strides of 1. We could support 2344 // other strides as long as it is uniform, but let's keep it simple 2345 // for now. 
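// A stride of 1 covers the common case of walking an array element by
// element (e.g. loading p[i] on consecutive lanes), which the contiguous
// MVE masked loads/stores handle directly.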
2346 continue; 2347 } else if (NextStride == -1 || 2348 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || 2349 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { 2350 LLVM_DEBUG(dbgs() 2351 << "Consecutive strides of 2 found, vld2/vstr2 can't " 2352 "be tail-predicated\n."); 2353 return false; 2354 // TODO: don't tail predicate if there is a reversed load? 2355 } else if (EnableMaskedGatherScatters) { 2356 // Gather/scatters do allow loading from arbitrary strides, at 2357 // least if they are loop invariant. 2358 // TODO: Loop variant strides should in theory work, too, but 2359 // this requires further testing. 2360 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr); 2361 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { 2362 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); 2363 if (PSE.getSE()->isLoopInvariant(Step, L)) 2364 continue; 2365 } 2366 } 2367 LLVM_DEBUG(dbgs() << "Bad stride found, can't " 2368 "tail-predicate\n."); 2369 return false; 2370 } 2371 } 2372 } 2373 2374 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n"); 2375 return true; 2376 } 2377 2378 bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { 2379 if (!EnableTailPredication) { 2380 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n"); 2381 return false; 2382 } 2383 2384 // Creating a predicated vector loop is the first step for generating a 2385 // tail-predicated hardware loop, for which we need the MVE masked 2386 // load/stores instructions: 2387 if (!ST->hasMVEIntegerOps()) 2388 return false; 2389 2390 LoopVectorizationLegality *LVL = TFI->LVL; 2391 Loop *L = LVL->getLoop(); 2392 2393 // For now, restrict this to single block loops. 2394 if (L->getNumBlocks() > 1) { 2395 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block " 2396 "loop.\n"); 2397 return false; 2398 } 2399 2400 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); 2401 2402 LoopInfo *LI = LVL->getLoopInfo(); 2403 HardwareLoopInfo HWLoopInfo(L); 2404 if (!HWLoopInfo.canAnalyze(*LI)) { 2405 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 2406 "analyzable.\n"); 2407 return false; 2408 } 2409 2410 AssumptionCache *AC = LVL->getAssumptionCache(); 2411 ScalarEvolution *SE = LVL->getScalarEvolution(); 2412 2413 // This checks if we have the low-overhead branch architecture 2414 // extension, and if we will create a hardware-loop: 2415 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) { 2416 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 2417 "profitable.\n"); 2418 return false; 2419 } 2420 2421 DominatorTree *DT = LVL->getDominatorTree(); 2422 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) { 2423 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 2424 "a candidate.\n"); 2425 return false; 2426 } 2427 2428 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); 2429 } 2430 2431 TailFoldingStyle 2432 ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const { 2433 if (!ST->hasMVEIntegerOps() || !EnableTailPredication) 2434 return TailFoldingStyle::DataWithoutLaneMask; 2435 2436 // Intrinsic @llvm.get.active.lane.mask is supported. 2437 // It is used in the MVETailPredication pass, which requires the number of 2438 // elements processed by this vector loop to setup the tail-predicated 2439 // loop. 
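// In IR terms the vectorizer will emit something like
// %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
// for each vector iteration, which the MVETailPredication pass can later
// rewrite into a VCTP-based tail-predicated loop.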
2440 return TailFoldingStyle::Data; 2441 } 2442 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2443 TTI::UnrollingPreferences &UP, 2444 OptimizationRemarkEmitter *ORE) { 2445 // Enable Upper bound unrolling universally, providing that we do not see an 2446 // active lane mask, which will be better kept as a loop to become tail 2447 // predicated than to be conditionally unrolled. 2448 UP.UpperBound = 2449 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) { 2450 return isa<IntrinsicInst>(I) && 2451 cast<IntrinsicInst>(I).getIntrinsicID() == 2452 Intrinsic::get_active_lane_mask; 2453 }); 2454 2455 // Only currently enable these preferences for M-Class cores. 2456 if (!ST->isMClass()) 2457 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); 2458 2459 // Disable loop unrolling for Oz and Os. 2460 UP.OptSizeThreshold = 0; 2461 UP.PartialOptSizeThreshold = 0; 2462 if (L->getHeader()->getParent()->hasOptSize()) 2463 return; 2464 2465 SmallVector<BasicBlock*, 4> ExitingBlocks; 2466 L->getExitingBlocks(ExitingBlocks); 2467 LLVM_DEBUG(dbgs() << "Loop has:\n" 2468 << "Blocks: " << L->getNumBlocks() << "\n" 2469 << "Exit blocks: " << ExitingBlocks.size() << "\n"); 2470 2471 // Only allow another exit other than the latch. This acts as an early exit 2472 // as it mirrors the profitability calculation of the runtime unroller. 2473 if (ExitingBlocks.size() > 2) 2474 return; 2475 2476 // Limit the CFG of the loop body for targets with a branch predictor. 2477 // Allowing 4 blocks permits if-then-else diamonds in the body. 2478 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4) 2479 return; 2480 2481 // Don't unroll vectorized loops, including the remainder loop 2482 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) 2483 return; 2484 2485 // Scan the loop: don't unroll loops with calls as this could prevent 2486 // inlining. 2487 InstructionCost Cost = 0; 2488 for (auto *BB : L->getBlocks()) { 2489 for (auto &I : *BB) { 2490 // Don't unroll vectorised loop. MVE does not benefit from it as much as 2491 // scalar code. 2492 if (I.getType()->isVectorTy()) 2493 return; 2494 2495 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2496 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2497 if (!isLoweredToCall(F)) 2498 continue; 2499 } 2500 return; 2501 } 2502 2503 SmallVector<const Value*, 4> Operands(I.operand_values()); 2504 Cost += getInstructionCost(&I, Operands, 2505 TargetTransformInfo::TCK_SizeAndLatency); 2506 } 2507 } 2508 2509 // On v6m cores, there are very few registers available. We can easily end up 2510 // spilling and reloading more registers in an unrolled loop. Look at the 2511 // number of LCSSA phis as a rough measure of how many registers will need to 2512 // be live out of the loop, reducing the default unroll count if more than 1 2513 // value is needed. In the long run, all of this should be being learnt by a 2514 // machine. 2515 unsigned UnrollCount = 4; 2516 if (ST->isThumb1Only()) { 2517 unsigned ExitingValues = 0; 2518 SmallVector<BasicBlock *, 4> ExitBlocks; 2519 L->getExitBlocks(ExitBlocks); 2520 for (auto *Exit : ExitBlocks) { 2521 // Count the number of LCSSA phis. Exclude values coming from GEP's as 2522 // only the last is expected to be needed for address operands. 2523 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) { 2524 return PH.getNumOperands() != 1 || 2525 !isa<GetElementPtrInst>(PH.getOperand(0)); 2526 }); 2527 ExitingValues = ExitingValues < LiveOuts ? 
LiveOuts : ExitingValues; 2528 } 2529 if (ExitingValues) 2530 UnrollCount /= ExitingValues; 2531 if (UnrollCount <= 1) 2532 return; 2533 } 2534 2535 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); 2536 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n"); 2537 2538 UP.Partial = true; 2539 UP.Runtime = true; 2540 UP.UnrollRemainder = true; 2541 UP.DefaultUnrollRuntimeCount = UnrollCount; 2542 UP.UnrollAndJam = true; 2543 UP.UnrollAndJamInnerLoopThreshold = 60; 2544 2545 // Force unrolling small loops can be very useful because of the branch 2546 // taken cost of the backedge. 2547 if (Cost < 12) 2548 UP.Force = true; 2549 } 2550 2551 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2552 TTI::PeelingPreferences &PP) { 2553 BaseT::getPeelingPreferences(L, SE, PP); 2554 } 2555 2556 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, 2557 TTI::ReductionFlags Flags) const { 2558 if (!ST->hasMVEIntegerOps()) 2559 return false; 2560 2561 unsigned ScalarBits = Ty->getScalarSizeInBits(); 2562 switch (Opcode) { 2563 case Instruction::Add: 2564 return ScalarBits <= 64; 2565 default: 2566 return false; 2567 } 2568 } 2569 2570 bool ARMTTIImpl::preferPredicatedReductionSelect( 2571 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { 2572 if (!ST->hasMVEIntegerOps()) 2573 return false; 2574 return true; 2575 } 2576 2577 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, 2578 StackOffset BaseOffset, 2579 bool HasBaseReg, int64_t Scale, 2580 unsigned AddrSpace) const { 2581 TargetLoweringBase::AddrMode AM; 2582 AM.BaseGV = BaseGV; 2583 AM.BaseOffs = BaseOffset.getFixed(); 2584 AM.HasBaseReg = HasBaseReg; 2585 AM.Scale = Scale; 2586 AM.ScalableOffset = BaseOffset.getScalable(); 2587 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) { 2588 if (ST->hasFPAO()) 2589 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 2590 return 0; 2591 } 2592 return -1; 2593 } 2594 2595 bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const { 2596 if (Thumb) { 2597 // B.W is available in any Thumb2-supporting target, and also in every 2598 // version of Armv8-M, even Baseline which does not include the rest of 2599 // Thumb2. 2600 return ST->isThumb2() || ST->hasV8MBaselineOps(); 2601 } else { 2602 // B is available in all versions of the Arm ISA, so the only question is 2603 // whether that ISA is available at all. 2604 return ST->hasARMOps(); 2605 } 2606 } 2607