1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "llvm/Analysis/IVDescriptors.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/Analysis/TargetTransformInfo.h" 16 #include "llvm/CodeGen/BasicTTIImpl.h" 17 #include "llvm/CodeGen/CostTable.h" 18 #include "llvm/CodeGen/TargetLowering.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/Intrinsics.h" 21 #include "llvm/IR/IntrinsicsAArch64.h" 22 #include "llvm/IR/PatternMatch.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Transforms/InstCombine/InstCombiner.h" 25 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 26 #include <algorithm> 27 #include <optional> 28 using namespace llvm; 29 using namespace llvm::PatternMatch; 30 31 #define DEBUG_TYPE "aarch64tti" 32 33 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 34 cl::init(true), cl::Hidden); 35 36 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 37 cl::Hidden); 38 39 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 40 cl::init(10), cl::Hidden); 41 42 static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", 43 cl::init(15), cl::Hidden); 44 45 static cl::opt<unsigned> 46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), 47 cl::Hidden); 48 49 static cl::opt<unsigned> CallPenaltyChangeSM( 50 "call-penalty-sm-change", cl::init(5), cl::Hidden, 51 cl::desc( 52 "Penalty of calling a function that requires a change to PSTATE.SM")); 53 54 static cl::opt<unsigned> InlineCallPenaltyChangeSM( 55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden, 56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM")); 57 58 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select", 59 cl::init(true), cl::Hidden); 60 61 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", 62 cl::init(true), cl::Hidden); 63 64 // A complete guess as to a reasonable cost. 65 static cl::opt<unsigned> 66 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, 67 cl::desc("The cost of a histcnt instruction")); 68 69 namespace { 70 class TailFoldingOption { 71 // These bitfields will only ever be set to something non-zero in operator=, 72 // when setting the -sve-tail-folding option. This option should always be of 73 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here 74 // InitialBits is one of (disabled|all|simple). EnableBits represents 75 // additional flags we're enabling, and DisableBits for those flags we're 76 // disabling. The default flag is tracked in the variable NeedsDefault, since 77 // at the time of setting the option we may not know what the default value 78 // for the CPU is. 
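  // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to All and
  // then clears the Reverse bit via DisableBits, while
  // "-sve-tail-folding=default+reductions" leaves NeedsDefault set so the
  // CPU's defaults are applied in getBits() before the Reductions bit is
  // enabled.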
79 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled; 80 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled; 81 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled; 82 83 // This value needs to be initialised to true in case the user does not 84 // explicitly set the -sve-tail-folding option. 85 bool NeedsDefault = true; 86 87 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; } 88 89 void setNeedsDefault(bool V) { NeedsDefault = V; } 90 91 void setEnableBit(TailFoldingOpts Bit) { 92 EnableBits |= Bit; 93 DisableBits &= ~Bit; 94 } 95 96 void setDisableBit(TailFoldingOpts Bit) { 97 EnableBits &= ~Bit; 98 DisableBits |= Bit; 99 } 100 101 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const { 102 TailFoldingOpts Bits = TailFoldingOpts::Disabled; 103 104 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) && 105 "Initial bits should only include one of " 106 "(disabled|all|simple|default)"); 107 Bits = NeedsDefault ? DefaultBits : InitialBits; 108 Bits |= EnableBits; 109 Bits &= ~DisableBits; 110 111 return Bits; 112 } 113 114 void reportError(std::string Opt) { 115 errs() << "invalid argument '" << Opt 116 << "' to -sve-tail-folding=; the option should be of the form\n" 117 " (disabled|all|default|simple)[+(reductions|recurrences" 118 "|reverse|noreductions|norecurrences|noreverse)]\n"; 119 report_fatal_error("Unrecognised tail-folding option"); 120 } 121 122 public: 123 124 void operator=(const std::string &Val) { 125 // If the user explicitly sets -sve-tail-folding= then treat as an error. 126 if (Val.empty()) { 127 reportError(""); 128 return; 129 } 130 131 // Since the user is explicitly setting the option we don't automatically 132 // need the default unless they require it. 133 setNeedsDefault(false); 134 135 SmallVector<StringRef, 4> TailFoldTypes; 136 StringRef(Val).split(TailFoldTypes, '+', -1, false); 137 138 unsigned StartIdx = 1; 139 if (TailFoldTypes[0] == "disabled") 140 setInitialBits(TailFoldingOpts::Disabled); 141 else if (TailFoldTypes[0] == "all") 142 setInitialBits(TailFoldingOpts::All); 143 else if (TailFoldTypes[0] == "default") 144 setNeedsDefault(true); 145 else if (TailFoldTypes[0] == "simple") 146 setInitialBits(TailFoldingOpts::Simple); 147 else { 148 StartIdx = 0; 149 setInitialBits(TailFoldingOpts::Disabled); 150 } 151 152 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) { 153 if (TailFoldTypes[I] == "reductions") 154 setEnableBit(TailFoldingOpts::Reductions); 155 else if (TailFoldTypes[I] == "recurrences") 156 setEnableBit(TailFoldingOpts::Recurrences); 157 else if (TailFoldTypes[I] == "reverse") 158 setEnableBit(TailFoldingOpts::Reverse); 159 else if (TailFoldTypes[I] == "noreductions") 160 setDisableBit(TailFoldingOpts::Reductions); 161 else if (TailFoldTypes[I] == "norecurrences") 162 setDisableBit(TailFoldingOpts::Recurrences); 163 else if (TailFoldTypes[I] == "noreverse") 164 setDisableBit(TailFoldingOpts::Reverse); 165 else 166 reportError(Val); 167 } 168 } 169 170 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const { 171 return (getBits(DefaultBits) & Required) == Required; 172 } 173 }; 174 } // namespace 175 176 TailFoldingOption TailFoldingOptionLoc; 177 178 cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding( 179 "sve-tail-folding", 180 cl::desc( 181 "Control the use of vectorisation using tail-folding for SVE where the" 182 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" 183 "\ndisabled (Initial) No loop types will vectorize using " 
184 "tail-folding" 185 "\ndefault (Initial) Uses the default tail-folding settings for " 186 "the target CPU" 187 "\nall (Initial) All legal loop types will vectorize using " 188 "tail-folding" 189 "\nsimple (Initial) Use tail-folding for simple loops (not " 190 "reductions or recurrences)" 191 "\nreductions Use tail-folding for loops containing reductions" 192 "\nnoreductions Inverse of above" 193 "\nrecurrences Use tail-folding for loops containing fixed order " 194 "recurrences" 195 "\nnorecurrences Inverse of above" 196 "\nreverse Use tail-folding for loops requiring reversed " 197 "predicates" 198 "\nnoreverse Inverse of above"), 199 cl::location(TailFoldingOptionLoc)); 200 201 // Experimental option that will only be fully functional when the 202 // code-generator is changed to use SVE instead of NEON for all fixed-width 203 // operations. 204 static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( 205 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden); 206 207 // Experimental option that will only be fully functional when the cost-model 208 // and code-generator have been changed to avoid using scalable vector 209 // instructions that are not legal in streaming SVE mode. 210 static cl::opt<bool> EnableScalableAutovecInStreamingMode( 211 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); 212 213 static bool isSMEABIRoutineCall(const CallInst &CI) { 214 const auto *F = CI.getCalledFunction(); 215 return F && StringSwitch<bool>(F->getName()) 216 .Case("__arm_sme_state", true) 217 .Case("__arm_tpidr2_save", true) 218 .Case("__arm_tpidr2_restore", true) 219 .Case("__arm_za_disable", true) 220 .Default(false); 221 } 222 223 /// Returns true if the function has explicit operations that can only be 224 /// lowered using incompatible instructions for the selected mode. This also 225 /// returns true if the function F may use or modify ZA state. 226 static bool hasPossibleIncompatibleOps(const Function *F) { 227 for (const BasicBlock &BB : *F) { 228 for (const Instruction &I : BB) { 229 // Be conservative for now and assume that any call to inline asm or to 230 // intrinsics could could result in non-streaming ops (e.g. calls to 231 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that 232 // all native LLVM instructions can be lowered to compatible instructions. 233 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() && 234 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) || 235 isSMEABIRoutineCall(cast<CallInst>(I)))) 236 return true; 237 } 238 } 239 return false; 240 } 241 242 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 243 const Function *Callee) const { 244 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); 245 246 // When inlining, we should consider the body of the function, not the 247 // interface. 
248 if (CalleeAttrs.hasStreamingBody()) { 249 CalleeAttrs.set(SMEAttrs::SM_Compatible, false); 250 CalleeAttrs.set(SMEAttrs::SM_Enabled, true); 251 } 252 253 if (CalleeAttrs.isNewZA()) 254 return false; 255 256 if (CallerAttrs.requiresLazySave(CalleeAttrs) || 257 CallerAttrs.requiresSMChange(CalleeAttrs) || 258 CallerAttrs.requiresPreservingZT0(CalleeAttrs)) { 259 if (hasPossibleIncompatibleOps(Callee)) 260 return false; 261 } 262 263 const TargetMachine &TM = getTLI()->getTargetMachine(); 264 265 const FeatureBitset &CallerBits = 266 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 267 const FeatureBitset &CalleeBits = 268 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 269 270 // Inline a callee if its target-features are a subset of the callers 271 // target-features. 272 return (CallerBits & CalleeBits) == CalleeBits; 273 } 274 275 bool AArch64TTIImpl::areTypesABICompatible( 276 const Function *Caller, const Function *Callee, 277 const ArrayRef<Type *> &Types) const { 278 if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) 279 return false; 280 281 // We need to ensure that argument promotion does not attempt to promote 282 // pointers to fixed-length vector types larger than 128 bits like 283 // <8 x float> (and pointers to aggregate types which have such fixed-length 284 // vector type members) into the values of the pointees. Such vector types 285 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the 286 // backend cannot lower such value arguments. The 128-bit fixed-length SVE 287 // types can be safely treated as 128-bit NEON types and they cannot be 288 // distinguished in IR. 289 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) { 290 auto FVTy = dyn_cast<FixedVectorType>(Ty); 291 return FVTy && 292 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128; 293 })) 294 return false; 295 296 return true; 297 } 298 299 unsigned 300 AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, 301 unsigned DefaultCallPenalty) const { 302 // This function calculates a penalty for executing Call in F. 303 // 304 // There are two ways this function can be called: 305 // (1) F: 306 // call from F -> G (the call here is Call) 307 // 308 // For (1), Call.getCaller() == F, so it will always return a high cost if 309 // a streaming-mode change is required (thus promoting the need to inline the 310 // function) 311 // 312 // (2) F: 313 // call from F -> G (the call here is not Call) 314 // G: 315 // call from G -> H (the call here is Call) 316 // 317 // For (2), if after inlining the body of G into F the call to H requires a 318 // streaming-mode change, and the call to G from F would also require a 319 // streaming-mode change, then there is benefit to do the streaming-mode 320 // change only once and avoid inlining of G into F. 321 SMEAttrs FAttrs(*F); 322 SMEAttrs CalleeAttrs(Call); 323 if (FAttrs.requiresSMChange(CalleeAttrs)) { 324 if (F == Call.getCaller()) // (1) 325 return CallPenaltyChangeSM * DefaultCallPenalty; 326 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2) 327 return InlineCallPenaltyChangeSM * DefaultCallPenalty; 328 } 329 330 return DefaultCallPenalty; 331 } 332 333 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( 334 TargetTransformInfo::RegisterKind K) const { 335 assert(K != TargetTransformInfo::RGK_Scalar); 336 return (K == TargetTransformInfo::RGK_FixedWidthVector && 337 ST->isNeonAvailable()); 338 } 339 340 /// Calculate the cost of materializing a 64-bit value. 
This helper 341 /// method might only calculate a fraction of a larger immediate. Therefore it 342 /// is valid to return a cost of ZERO. 343 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 344 // Check if the immediate can be encoded within an instruction. 345 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 346 return 0; 347 348 if (Val < 0) 349 Val = ~Val; 350 351 // Calculate how many moves we will need to materialize this constant. 352 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 353 AArch64_IMM::expandMOVImm(Val, 64, Insn); 354 return Insn.size(); 355 } 356 357 /// Calculate the cost of materializing the given constant. 358 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 359 TTI::TargetCostKind CostKind) { 360 assert(Ty->isIntegerTy()); 361 362 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 363 if (BitSize == 0) 364 return ~0U; 365 366 // Sign-extend all constants to a multiple of 64-bit. 367 APInt ImmVal = Imm; 368 if (BitSize & 0x3f) 369 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 370 371 // Split the constant into 64-bit chunks and calculate the cost for each 372 // chunk. 373 InstructionCost Cost = 0; 374 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 375 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 376 int64_t Val = Tmp.getSExtValue(); 377 Cost += getIntImmCost(Val); 378 } 379 // We need at least one instruction to materialze the constant. 380 return std::max<InstructionCost>(1, Cost); 381 } 382 383 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 384 const APInt &Imm, Type *Ty, 385 TTI::TargetCostKind CostKind, 386 Instruction *Inst) { 387 assert(Ty->isIntegerTy()); 388 389 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 390 // There is no cost model for constants with a bit size of 0. Return TCC_Free 391 // here, so that constant hoisting will ignore this constant. 392 if (BitSize == 0) 393 return TTI::TCC_Free; 394 395 unsigned ImmIdx = ~0U; 396 switch (Opcode) { 397 default: 398 return TTI::TCC_Free; 399 case Instruction::GetElementPtr: 400 // Always hoist the base address of a GetElementPtr. 401 if (Idx == 0) 402 return 2 * TTI::TCC_Basic; 403 return TTI::TCC_Free; 404 case Instruction::Store: 405 ImmIdx = 0; 406 break; 407 case Instruction::Add: 408 case Instruction::Sub: 409 case Instruction::Mul: 410 case Instruction::UDiv: 411 case Instruction::SDiv: 412 case Instruction::URem: 413 case Instruction::SRem: 414 case Instruction::And: 415 case Instruction::Or: 416 case Instruction::Xor: 417 case Instruction::ICmp: 418 ImmIdx = 1; 419 break; 420 // Always return TCC_Free for the shift value of a shift instruction. 421 case Instruction::Shl: 422 case Instruction::LShr: 423 case Instruction::AShr: 424 if (Idx == 1) 425 return TTI::TCC_Free; 426 break; 427 case Instruction::Trunc: 428 case Instruction::ZExt: 429 case Instruction::SExt: 430 case Instruction::IntToPtr: 431 case Instruction::PtrToInt: 432 case Instruction::BitCast: 433 case Instruction::PHI: 434 case Instruction::Call: 435 case Instruction::Select: 436 case Instruction::Ret: 437 case Instruction::Load: 438 break; 439 } 440 441 if (Idx == ImmIdx) { 442 int NumConstants = (BitSize + 63) / 64; 443 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 444 return (Cost <= NumConstants * TTI::TCC_Basic) 445 ? 
static_cast<int>(TTI::TCC_Free) 446 : Cost; 447 } 448 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 449 } 450 451 InstructionCost 452 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 453 const APInt &Imm, Type *Ty, 454 TTI::TargetCostKind CostKind) { 455 assert(Ty->isIntegerTy()); 456 457 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 458 // There is no cost model for constants with a bit size of 0. Return TCC_Free 459 // here, so that constant hoisting will ignore this constant. 460 if (BitSize == 0) 461 return TTI::TCC_Free; 462 463 // Most (all?) AArch64 intrinsics do not support folding immediates into the 464 // selected instruction, so we compute the materialization cost for the 465 // immediate directly. 466 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) 467 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 468 469 switch (IID) { 470 default: 471 return TTI::TCC_Free; 472 case Intrinsic::sadd_with_overflow: 473 case Intrinsic::uadd_with_overflow: 474 case Intrinsic::ssub_with_overflow: 475 case Intrinsic::usub_with_overflow: 476 case Intrinsic::smul_with_overflow: 477 case Intrinsic::umul_with_overflow: 478 if (Idx == 1) { 479 int NumConstants = (BitSize + 63) / 64; 480 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 481 return (Cost <= NumConstants * TTI::TCC_Basic) 482 ? static_cast<int>(TTI::TCC_Free) 483 : Cost; 484 } 485 break; 486 case Intrinsic::experimental_stackmap: 487 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 488 return TTI::TCC_Free; 489 break; 490 case Intrinsic::experimental_patchpoint_void: 491 case Intrinsic::experimental_patchpoint: 492 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 493 return TTI::TCC_Free; 494 break; 495 case Intrinsic::experimental_gc_statepoint: 496 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 497 return TTI::TCC_Free; 498 break; 499 } 500 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 501 } 502 503 TargetTransformInfo::PopcntSupportKind 504 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 505 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 506 if (TyWidth == 32 || TyWidth == 64) 507 return TTI::PSK_FastHardware; 508 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 509 return TTI::PSK_Software; 510 } 511 512 static bool isUnpackedVectorVT(EVT VecVT) { 513 return VecVT.isScalableVector() && 514 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; 515 } 516 517 static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { 518 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers 519 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements 520 521 // Only allow (32b and 64b) integers or pointers for now... 522 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || 523 (EltTy->getScalarSizeInBits() != 32 && 524 EltTy->getScalarSizeInBits() != 64)) 525 return InstructionCost::getInvalid(); 526 527 // FIXME: Hacky check for legal vector types. We can promote smaller types 528 // but we cannot legalize vectors via splitting for histcnt. 529 // FIXME: We should be able to generate histcnt for fixed-length vectors 530 // using ptrue with a specific VL. 
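  // In practice the check below only accepts scalable bucket-pointer vectors
  // with two or four elements (e.g. <vscale x 2 x ptr> and <vscale x 4 x ptr>);
  // everything else is given an invalid cost.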
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
         VTy->getElementCount().getKnownMinValue() != 4) ||
        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
        !VTy->isScalableTy())
      return InstructionCost::getInvalid();

  return InstructionCost(BaseHistCntCost);
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
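    // For example, an illegal <vscale x 8 x i64> stepvector is split into four
    // legal <vscale x 2 x i64> parts, so LT.first == 4 and the total is one
    // `index' plus three vector adds.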
613 if (LT.first > 1) { 614 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); 615 InstructionCost AddCost = 616 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); 617 Cost += AddCost * (LT.first - 1); 618 } 619 return Cost; 620 } 621 case Intrinsic::vector_extract: 622 case Intrinsic::vector_insert: { 623 // If both the vector and subvector types are legal types and the index 624 // is 0, then this should be a no-op or simple operation; return a 625 // relatively low cost. 626 627 // If arguments aren't actually supplied, then we cannot determine the 628 // value of the index. We also want to skip predicate types. 629 if (ICA.getArgs().size() != ICA.getArgTypes().size() || 630 ICA.getReturnType()->getScalarType()->isIntegerTy(1)) 631 break; 632 633 LLVMContext &C = RetTy->getContext(); 634 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); 635 bool IsExtract = ICA.getID() == Intrinsic::vector_extract; 636 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy) 637 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]); 638 // Skip this if either the vector or subvector types are unpacked 639 // SVE types; they may get lowered to stack stores and loads. 640 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT)) 641 break; 642 643 TargetLoweringBase::LegalizeKind SubVecLK = 644 getTLI()->getTypeConversion(C, SubVecVT); 645 TargetLoweringBase::LegalizeKind VecLK = 646 getTLI()->getTypeConversion(C, VecVT); 647 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; 648 const ConstantInt *CIdx = cast<ConstantInt>(Idx); 649 if (SubVecLK.first == TargetLoweringBase::TypeLegal && 650 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero()) 651 return TTI::TCC_Free; 652 break; 653 } 654 case Intrinsic::bitreverse: { 655 static const CostTblEntry BitreverseTbl[] = { 656 {Intrinsic::bitreverse, MVT::i32, 1}, 657 {Intrinsic::bitreverse, MVT::i64, 1}, 658 {Intrinsic::bitreverse, MVT::v8i8, 1}, 659 {Intrinsic::bitreverse, MVT::v16i8, 1}, 660 {Intrinsic::bitreverse, MVT::v4i16, 2}, 661 {Intrinsic::bitreverse, MVT::v8i16, 2}, 662 {Intrinsic::bitreverse, MVT::v2i32, 2}, 663 {Intrinsic::bitreverse, MVT::v4i32, 2}, 664 {Intrinsic::bitreverse, MVT::v1i64, 2}, 665 {Intrinsic::bitreverse, MVT::v2i64, 2}, 666 }; 667 const auto LegalisationCost = getTypeLegalizationCost(RetTy); 668 const auto *Entry = 669 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); 670 if (Entry) { 671 // Cost Model is using the legal type(i32) that i8 and i16 will be 672 // converted to +1 so that we match the actual lowering cost 673 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || 674 TLI->getValueType(DL, RetTy, true) == MVT::i16) 675 return LegalisationCost.first * Entry->Cost + 1; 676 677 return LegalisationCost.first * Entry->Cost; 678 } 679 break; 680 } 681 case Intrinsic::ctpop: { 682 if (!ST->hasNEON()) { 683 // 32-bit or 64-bit ctpop without NEON is 12 instructions. 
684 return getTypeLegalizationCost(RetTy).first * 12; 685 } 686 static const CostTblEntry CtpopCostTbl[] = { 687 {ISD::CTPOP, MVT::v2i64, 4}, 688 {ISD::CTPOP, MVT::v4i32, 3}, 689 {ISD::CTPOP, MVT::v8i16, 2}, 690 {ISD::CTPOP, MVT::v16i8, 1}, 691 {ISD::CTPOP, MVT::i64, 4}, 692 {ISD::CTPOP, MVT::v2i32, 3}, 693 {ISD::CTPOP, MVT::v4i16, 2}, 694 {ISD::CTPOP, MVT::v8i8, 1}, 695 {ISD::CTPOP, MVT::i32, 5}, 696 }; 697 auto LT = getTypeLegalizationCost(RetTy); 698 MVT MTy = LT.second; 699 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { 700 // Extra cost of +1 when illegal vector types are legalized by promoting 701 // the integer type. 702 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != 703 RetTy->getScalarSizeInBits() 704 ? 1 705 : 0; 706 return LT.first * Entry->Cost + ExtraCost; 707 } 708 break; 709 } 710 case Intrinsic::sadd_with_overflow: 711 case Intrinsic::uadd_with_overflow: 712 case Intrinsic::ssub_with_overflow: 713 case Intrinsic::usub_with_overflow: 714 case Intrinsic::smul_with_overflow: 715 case Intrinsic::umul_with_overflow: { 716 static const CostTblEntry WithOverflowCostTbl[] = { 717 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 718 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 719 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 720 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 721 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 722 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 723 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 724 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 725 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 726 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 727 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 728 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 729 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 730 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 731 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 732 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 733 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 734 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 735 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 736 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 737 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 738 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 739 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 740 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 741 }; 742 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 743 if (MTy.isSimple()) 744 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 745 MTy.getSimpleVT())) 746 return Entry->Cost; 747 break; 748 } 749 case Intrinsic::fptosi_sat: 750 case Intrinsic::fptoui_sat: { 751 if (ICA.getArgTypes().empty()) 752 break; 753 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 754 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); 755 EVT MTy = TLI->getValueType(DL, RetTy); 756 // Check for the legal types, which are where the size of the input and the 757 // output are the same, or we are using cvt f64->i32 or f32->i64. 
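    // For example, fptosi.sat from f32->i32, v2f64->v2i64 or f64->i32 maps
    // directly onto a saturating fcvtzs/fcvtzu and is costed as LT.first,
    // whereas other combinations fall through to the convert + min/max
    // sequence modelled below.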
758 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 759 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 760 LT.second == MVT::v2f64) && 761 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 762 (LT.second == MVT::f64 && MTy == MVT::i32) || 763 (LT.second == MVT::f32 && MTy == MVT::i64))) 764 return LT.first; 765 // Similarly for fp16 sizes 766 if (ST->hasFullFP16() && 767 ((LT.second == MVT::f16 && MTy == MVT::i32) || 768 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 769 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 770 return LT.first; 771 772 // Otherwise we use a legal convert followed by a min+max 773 if ((LT.second.getScalarType() == MVT::f32 || 774 LT.second.getScalarType() == MVT::f64 || 775 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 776 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 777 Type *LegalTy = 778 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 779 if (LT.second.isVector()) 780 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 781 InstructionCost Cost = 1; 782 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, 783 LegalTy, {LegalTy, LegalTy}); 784 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 785 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, 786 LegalTy, {LegalTy, LegalTy}); 787 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 788 return LT.first * Cost; 789 } 790 break; 791 } 792 case Intrinsic::fshl: 793 case Intrinsic::fshr: { 794 if (ICA.getArgs().empty()) 795 break; 796 797 // TODO: Add handling for fshl where third argument is not a constant. 798 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]); 799 if (!OpInfoZ.isConstant()) 800 break; 801 802 const auto LegalisationCost = getTypeLegalizationCost(RetTy); 803 if (OpInfoZ.isUniform()) { 804 // FIXME: The costs could be lower if the codegen is better. 805 static const CostTblEntry FshlTbl[] = { 806 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr 807 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4}, 808 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3}, 809 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}}; 810 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl 811 // to avoid having to duplicate the costs. 812 const auto *Entry = 813 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second); 814 if (Entry) 815 return LegalisationCost.first * Entry->Cost; 816 } 817 818 auto TyL = getTypeLegalizationCost(RetTy); 819 if (!RetTy->isIntegerTy()) 820 break; 821 822 // Estimate cost manually, as types like i8 and i16 will get promoted to 823 // i32 and CostTableLookup will ignore the extra conversion cost. 824 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && 825 RetTy->getScalarSizeInBits() < 64) || 826 (RetTy->getScalarSizeInBits() % 64 != 0); 827 unsigned ExtraCost = HigherCost ? 1 : 0; 828 if (RetTy->getScalarSizeInBits() == 32 || 829 RetTy->getScalarSizeInBits() == 64) 830 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single 831 // extr instruction. 
832 else if (HigherCost) 833 ExtraCost = 1; 834 else 835 break; 836 return TyL.first + ExtraCost; 837 } 838 case Intrinsic::get_active_lane_mask: { 839 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType()); 840 if (RetTy) { 841 EVT RetVT = getTLI()->getValueType(DL, RetTy); 842 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); 843 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && 844 !getTLI()->isTypeLegal(RetVT)) { 845 // We don't have enough context at this point to determine if the mask 846 // is going to be kept live after the block, which will force the vXi1 847 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. 848 // For now, we just assume the vectorizer created this intrinsic and 849 // the result will be the input for a PHI. In this case the cost will 850 // be extremely high for fixed-width vectors. 851 // NOTE: getScalarizationOverhead returns a cost that's far too 852 // pessimistic for the actual generated codegen. In reality there are 853 // two instructions generated per lane. 854 return RetTy->getNumElements() * 2; 855 } 856 } 857 break; 858 } 859 default: 860 break; 861 } 862 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 863 } 864 865 /// The function will remove redundant reinterprets casting in the presence 866 /// of the control flow 867 static std::optional<Instruction *> processPhiNode(InstCombiner &IC, 868 IntrinsicInst &II) { 869 SmallVector<Instruction *, 32> Worklist; 870 auto RequiredType = II.getType(); 871 872 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 873 assert(PN && "Expected Phi Node!"); 874 875 // Don't create a new Phi unless we can remove the old one. 876 if (!PN->hasOneUse()) 877 return std::nullopt; 878 879 for (Value *IncValPhi : PN->incoming_values()) { 880 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 881 if (!Reinterpret || 882 Reinterpret->getIntrinsicID() != 883 Intrinsic::aarch64_sve_convert_to_svbool || 884 RequiredType != Reinterpret->getArgOperand(0)->getType()) 885 return std::nullopt; 886 } 887 888 // Create the new Phi 889 IC.Builder.SetInsertPoint(PN); 890 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 891 Worklist.push_back(PN); 892 893 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 894 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 895 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 896 Worklist.push_back(Reinterpret); 897 } 898 899 // Cleanup Phi Node and reinterprets 900 return IC.replaceInstUsesWith(II, NPN); 901 } 902 903 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 904 // => (binop (pred) (from_svbool _) (from_svbool _)) 905 // 906 // The above transformation eliminates a `to_svbool` in the predicate 907 // operand of bitwise operation `binop` by narrowing the vector width of 908 // the operation. For example, it would convert a `<vscale x 16 x i1> 909 // and` into a `<vscale x 4 x i1> and`. This is profitable because 910 // to_svbool must zero the new lanes during widening, whereas 911 // from_svbool is free. 
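// For example (types and mangling abbreviated):
//   %wide = to_svbool(<vscale x 4 x i1> %pg)
//   %and  = and_z(%wide, %a, %b)      ; <vscale x 16 x i1>
//   %res  = from_svbool(%and)         ; <vscale x 4 x i1>
// becomes
//   and_z(%pg, from_svbool(%a), from_svbool(%b))  ; <vscale x 4 x i1>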
912 static std::optional<Instruction *> 913 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { 914 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 915 if (!BinOp) 916 return std::nullopt; 917 918 auto IntrinsicID = BinOp->getIntrinsicID(); 919 switch (IntrinsicID) { 920 case Intrinsic::aarch64_sve_and_z: 921 case Intrinsic::aarch64_sve_bic_z: 922 case Intrinsic::aarch64_sve_eor_z: 923 case Intrinsic::aarch64_sve_nand_z: 924 case Intrinsic::aarch64_sve_nor_z: 925 case Intrinsic::aarch64_sve_orn_z: 926 case Intrinsic::aarch64_sve_orr_z: 927 break; 928 default: 929 return std::nullopt; 930 } 931 932 auto BinOpPred = BinOp->getOperand(0); 933 auto BinOpOp1 = BinOp->getOperand(1); 934 auto BinOpOp2 = BinOp->getOperand(2); 935 936 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 937 if (!PredIntr || 938 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 939 return std::nullopt; 940 941 auto PredOp = PredIntr->getOperand(0); 942 auto PredOpTy = cast<VectorType>(PredOp->getType()); 943 if (PredOpTy != II.getType()) 944 return std::nullopt; 945 946 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 947 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( 948 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 949 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 950 if (BinOpOp1 == BinOpOp2) 951 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 952 else 953 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic( 954 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 955 956 auto NarrowedBinOp = 957 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 958 return IC.replaceInstUsesWith(II, NarrowedBinOp); 959 } 960 961 static std::optional<Instruction *> 962 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { 963 // If the reinterpret instruction operand is a PHI Node 964 if (isa<PHINode>(II.getArgOperand(0))) 965 return processPhiNode(IC, II); 966 967 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 968 return BinOpCombine; 969 970 // Ignore converts to/from svcount_t. 971 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) || 972 isa<TargetExtType>(II.getType())) 973 return std::nullopt; 974 975 SmallVector<Instruction *, 32> CandidatesForRemoval; 976 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 977 978 const auto *IVTy = cast<VectorType>(II.getType()); 979 980 // Walk the chain of conversions. 981 while (Cursor) { 982 // If the type of the cursor has fewer lanes than the final result, zeroing 983 // must take place, which breaks the equivalence chain. 984 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 985 if (CursorVTy->getElementCount().getKnownMinValue() < 986 IVTy->getElementCount().getKnownMinValue()) 987 break; 988 989 // If the cursor has the same type as I, it is a viable replacement. 990 if (Cursor->getType() == IVTy) 991 EarliestReplacement = Cursor; 992 993 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 994 995 // If this is not an SVE conversion intrinsic, this is the end of the chain. 
996 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 997 Intrinsic::aarch64_sve_convert_to_svbool || 998 IntrinsicCursor->getIntrinsicID() == 999 Intrinsic::aarch64_sve_convert_from_svbool)) 1000 break; 1001 1002 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 1003 Cursor = IntrinsicCursor->getOperand(0); 1004 } 1005 1006 // If no viable replacement in the conversion chain was found, there is 1007 // nothing to do. 1008 if (!EarliestReplacement) 1009 return std::nullopt; 1010 1011 return IC.replaceInstUsesWith(II, EarliestReplacement); 1012 } 1013 1014 static bool isAllActivePredicate(Value *Pred) { 1015 // Look through convert.from.svbool(convert.to.svbool(...) chain. 1016 Value *UncastedPred; 1017 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 1018 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 1019 m_Value(UncastedPred))))) 1020 // If the predicate has the same or less lanes than the uncasted 1021 // predicate then we know the casting has no effect. 1022 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 1023 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 1024 Pred = UncastedPred; 1025 1026 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1027 m_ConstantInt<AArch64SVEPredPattern::all>())); 1028 } 1029 1030 // Erase unary operation where predicate has all inactive lanes 1031 static std::optional<Instruction *> 1032 instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, 1033 int PredPos) { 1034 if (match(II.getOperand(PredPos), m_ZeroInt())) { 1035 return IC.eraseInstFromFunction(II); 1036 } 1037 return std::nullopt; 1038 } 1039 1040 // Simplify unary operation where predicate has all inactive lanes by replacing 1041 // instruction with zeroed object 1042 static std::optional<Instruction *> 1043 instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) { 1044 if (match(II.getOperand(0), m_ZeroInt())) { 1045 Constant *Node; 1046 Type *RetTy = II.getType(); 1047 if (RetTy->isStructTy()) { 1048 auto StructT = cast<StructType>(RetTy); 1049 auto VecT = StructT->getElementType(0); 1050 SmallVector<llvm::Constant *, 4> ZerVec; 1051 for (unsigned i = 0; i < StructT->getNumElements(); i++) { 1052 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? 
ConstantFP::get(VecT, 0.0) 1053 : ConstantInt::get(VecT, 0)); 1054 } 1055 Node = ConstantStruct::get(StructT, ZerVec); 1056 } else if (RetTy->isFPOrFPVectorTy()) 1057 Node = ConstantFP::get(RetTy, 0.0); 1058 else 1059 Node = ConstantInt::get(II.getType(), 0); 1060 1061 IC.replaceInstUsesWith(II, Node); 1062 return IC.eraseInstFromFunction(II); 1063 } 1064 return std::nullopt; 1065 } 1066 1067 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, 1068 IntrinsicInst &II) { 1069 // svsel(ptrue, x, y) => x 1070 auto *OpPredicate = II.getOperand(0); 1071 if (isAllActivePredicate(OpPredicate)) 1072 return IC.replaceInstUsesWith(II, II.getOperand(1)); 1073 1074 auto Select = 1075 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2)); 1076 return IC.replaceInstUsesWith(II, Select); 1077 } 1078 1079 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 1080 IntrinsicInst &II) { 1081 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 1082 if (!Pg) 1083 return std::nullopt; 1084 1085 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 1086 return std::nullopt; 1087 1088 const auto PTruePattern = 1089 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 1090 if (PTruePattern != AArch64SVEPredPattern::vl1) 1091 return std::nullopt; 1092 1093 // The intrinsic is inserting into lane zero so use an insert instead. 1094 auto *IdxTy = Type::getInt64Ty(II.getContext()); 1095 auto *Insert = InsertElementInst::Create( 1096 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 1097 Insert->insertBefore(&II); 1098 Insert->takeName(&II); 1099 1100 return IC.replaceInstUsesWith(II, Insert); 1101 } 1102 1103 static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 1104 IntrinsicInst &II) { 1105 // Replace DupX with a regular IR splat. 1106 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1107 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(), 1108 II.getArgOperand(0)); 1109 Splat->takeName(&II); 1110 return IC.replaceInstUsesWith(II, Splat); 1111 } 1112 1113 static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 1114 IntrinsicInst &II) { 1115 LLVMContext &Ctx = II.getContext(); 1116 1117 // Check that the predicate is all active 1118 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 1119 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 1120 return std::nullopt; 1121 1122 const auto PTruePattern = 1123 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 1124 if (PTruePattern != AArch64SVEPredPattern::all) 1125 return std::nullopt; 1126 1127 // Check that we have a compare of zero.. 
1128 auto *SplatValue = 1129 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 1130 if (!SplatValue || !SplatValue->isZero()) 1131 return std::nullopt; 1132 1133 // ..against a dupq 1134 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 1135 if (!DupQLane || 1136 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 1137 return std::nullopt; 1138 1139 // Where the dupq is a lane 0 replicate of a vector insert 1140 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 1141 return std::nullopt; 1142 1143 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 1144 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) 1145 return std::nullopt; 1146 1147 // Where the vector insert is a fixed constant vector insert into undef at 1148 // index zero 1149 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 1150 return std::nullopt; 1151 1152 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 1153 return std::nullopt; 1154 1155 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 1156 if (!ConstVec) 1157 return std::nullopt; 1158 1159 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 1160 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 1161 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 1162 return std::nullopt; 1163 1164 unsigned NumElts = VecTy->getNumElements(); 1165 unsigned PredicateBits = 0; 1166 1167 // Expand intrinsic operands to a 16-bit byte level predicate 1168 for (unsigned I = 0; I < NumElts; ++I) { 1169 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 1170 if (!Arg) 1171 return std::nullopt; 1172 if (!Arg->isZero()) 1173 PredicateBits |= 1 << (I * (16 / NumElts)); 1174 } 1175 1176 // If all bits are zero bail early with an empty predicate 1177 if (PredicateBits == 0) { 1178 auto *PFalse = Constant::getNullValue(II.getType()); 1179 PFalse->takeName(&II); 1180 return IC.replaceInstUsesWith(II, PFalse); 1181 } 1182 1183 // Calculate largest predicate type used (where byte predicate is largest) 1184 unsigned Mask = 8; 1185 for (unsigned I = 0; I < 16; ++I) 1186 if ((PredicateBits & (1 << I)) != 0) 1187 Mask |= (I % 8); 1188 1189 unsigned PredSize = Mask & -Mask; 1190 auto *PredType = ScalableVectorType::get( 1191 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 1192 1193 // Ensure all relevant bits are set 1194 for (unsigned I = 0; I < 16; I += PredSize) 1195 if ((PredicateBits & (1 << I)) == 0) 1196 return std::nullopt; 1197 1198 auto *PTruePat = 1199 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 1200 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 1201 {PredType}, {PTruePat}); 1202 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( 1203 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 1204 auto *ConvertFromSVBool = 1205 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 1206 {II.getType()}, {ConvertToSVBool}); 1207 1208 ConvertFromSVBool->takeName(&II); 1209 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 1210 } 1211 1212 static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, 1213 IntrinsicInst &II) { 1214 Value *Pg = II.getArgOperand(0); 1215 Value *Vec = II.getArgOperand(1); 1216 auto IntrinsicID = II.getIntrinsicID(); 1217 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 1218 1219 // lastX(splat(X)) --> X 1220 if (auto *SplatVal = getSplatValue(Vec)) 1221 return IC.replaceInstUsesWith(II, 
SplatVal); 1222 1223 // If x and/or y is a splat value then: 1224 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 1225 Value *LHS, *RHS; 1226 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 1227 if (isSplatValue(LHS) || isSplatValue(RHS)) { 1228 auto *OldBinOp = cast<BinaryOperator>(Vec); 1229 auto OpC = OldBinOp->getOpcode(); 1230 auto *NewLHS = 1231 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 1232 auto *NewRHS = 1233 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 1234 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 1235 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator()); 1236 return IC.replaceInstUsesWith(II, NewBinOp); 1237 } 1238 } 1239 1240 auto *C = dyn_cast<Constant>(Pg); 1241 if (IsAfter && C && C->isNullValue()) { 1242 // The intrinsic is extracting lane 0 so use an extract instead. 1243 auto *IdxTy = Type::getInt64Ty(II.getContext()); 1244 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 1245 Extract->insertBefore(&II); 1246 Extract->takeName(&II); 1247 return IC.replaceInstUsesWith(II, Extract); 1248 } 1249 1250 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 1251 if (!IntrPG) 1252 return std::nullopt; 1253 1254 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 1255 return std::nullopt; 1256 1257 const auto PTruePattern = 1258 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 1259 1260 // Can the intrinsic's predicate be converted to a known constant index? 1261 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 1262 if (!MinNumElts) 1263 return std::nullopt; 1264 1265 unsigned Idx = MinNumElts - 1; 1266 // Increment the index if extracting the element after the last active 1267 // predicate element. 1268 if (IsAfter) 1269 ++Idx; 1270 1271 // Ignore extracts whose index is larger than the known minimum vector 1272 // length. NOTE: This is an artificial constraint where we prefer to 1273 // maintain what the user asked for until an alternative is proven faster. 1274 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 1275 if (Idx >= PgVTy->getMinNumElements()) 1276 return std::nullopt; 1277 1278 // The intrinsic is extracting a fixed lane so use an extract instead. 1279 auto *IdxTy = Type::getInt64Ty(II.getContext()); 1280 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 1281 Extract->insertBefore(&II); 1282 Extract->takeName(&II); 1283 return IC.replaceInstUsesWith(II, Extract); 1284 } 1285 1286 static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, 1287 IntrinsicInst &II) { 1288 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar 1289 // integer variant across a variety of micro-architectures. Replace scalar 1290 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple 1291 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more 1292 // depending on the micro-architecture, but has been observed as generally 1293 // being faster, particularly when the CLAST[AB] op is a loop-carried 1294 // dependency. 
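  // Roughly: a scalar "clasta w0, p0, w0, z1.s" becomes the SIMD&FP form
  // "clasta s2, p0, s2, z1.s" plus moves between the general-purpose and
  // FP/SIMD register files.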
1295 Value *Pg = II.getArgOperand(0); 1296 Value *Fallback = II.getArgOperand(1); 1297 Value *Vec = II.getArgOperand(2); 1298 Type *Ty = II.getType(); 1299 1300 if (!Ty->isIntegerTy()) 1301 return std::nullopt; 1302 1303 Type *FPTy; 1304 switch (cast<IntegerType>(Ty)->getBitWidth()) { 1305 default: 1306 return std::nullopt; 1307 case 16: 1308 FPTy = IC.Builder.getHalfTy(); 1309 break; 1310 case 32: 1311 FPTy = IC.Builder.getFloatTy(); 1312 break; 1313 case 64: 1314 FPTy = IC.Builder.getDoubleTy(); 1315 break; 1316 } 1317 1318 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy); 1319 auto *FPVTy = VectorType::get( 1320 FPTy, cast<VectorType>(Vec->getType())->getElementCount()); 1321 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy); 1322 auto *FPII = IC.Builder.CreateIntrinsic( 1323 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec}); 1324 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType()); 1325 return IC.replaceInstUsesWith(II, FPIItoInt); 1326 } 1327 1328 static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 1329 IntrinsicInst &II) { 1330 LLVMContext &Ctx = II.getContext(); 1331 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 1332 // can work with RDFFR_PP for ptest elimination. 1333 auto *AllPat = 1334 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 1335 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 1336 {II.getType()}, {AllPat}); 1337 auto *RDFFR = 1338 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 1339 RDFFR->takeName(&II); 1340 return IC.replaceInstUsesWith(II, RDFFR); 1341 } 1342 1343 static std::optional<Instruction *> 1344 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 1345 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 1346 1347 if (Pattern == AArch64SVEPredPattern::all) { 1348 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 1349 auto *VScale = IC.Builder.CreateVScale(StepVal); 1350 VScale->takeName(&II); 1351 return IC.replaceInstUsesWith(II, VScale); 1352 } 1353 1354 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 1355 1356 return MinNumElts && NumElts >= MinNumElts 1357 ? std::optional<Instruction *>(IC.replaceInstUsesWith( 1358 II, ConstantInt::get(II.getType(), MinNumElts))) 1359 : std::nullopt; 1360 } 1361 1362 static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 1363 IntrinsicInst &II) { 1364 Value *PgVal = II.getArgOperand(0); 1365 Value *OpVal = II.getArgOperand(1); 1366 1367 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). 1368 // Later optimizations prefer this form. 
1369 if (PgVal == OpVal && 1370 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || 1371 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { 1372 Value *Ops[] = {PgVal, OpVal}; 1373 Type *Tys[] = {PgVal->getType()}; 1374 1375 auto *PTest = 1376 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops); 1377 PTest->takeName(&II); 1378 1379 return IC.replaceInstUsesWith(II, PTest); 1380 } 1381 1382 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal); 1383 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal); 1384 1385 if (!Pg || !Op) 1386 return std::nullopt; 1387 1388 Intrinsic::ID OpIID = Op->getIntrinsicID(); 1389 1390 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 1391 OpIID == Intrinsic::aarch64_sve_convert_to_svbool && 1392 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) { 1393 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)}; 1394 Type *Tys[] = {Pg->getArgOperand(0)->getType()}; 1395 1396 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 1397 1398 PTest->takeName(&II); 1399 return IC.replaceInstUsesWith(II, PTest); 1400 } 1401 1402 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)). 1403 // Later optimizations may rewrite sequence to use the flag-setting variant 1404 // of instruction X to remove PTEST. 1405 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && 1406 ((OpIID == Intrinsic::aarch64_sve_brka_z) || 1407 (OpIID == Intrinsic::aarch64_sve_brkb_z) || 1408 (OpIID == Intrinsic::aarch64_sve_brkpa_z) || 1409 (OpIID == Intrinsic::aarch64_sve_brkpb_z) || 1410 (OpIID == Intrinsic::aarch64_sve_rdffr_z) || 1411 (OpIID == Intrinsic::aarch64_sve_and_z) || 1412 (OpIID == Intrinsic::aarch64_sve_bic_z) || 1413 (OpIID == Intrinsic::aarch64_sve_eor_z) || 1414 (OpIID == Intrinsic::aarch64_sve_nand_z) || 1415 (OpIID == Intrinsic::aarch64_sve_nor_z) || 1416 (OpIID == Intrinsic::aarch64_sve_orn_z) || 1417 (OpIID == Intrinsic::aarch64_sve_orr_z))) { 1418 Value *Ops[] = {Pg->getArgOperand(0), Pg}; 1419 Type *Tys[] = {Pg->getType()}; 1420 1421 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 1422 PTest->takeName(&II); 1423 1424 return IC.replaceInstUsesWith(II, PTest); 1425 } 1426 1427 return std::nullopt; 1428 } 1429 1430 template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> 1431 static std::optional<Instruction *> 1432 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, 1433 bool MergeIntoAddendOp) { 1434 Value *P = II.getOperand(0); 1435 Value *MulOp0, *MulOp1, *AddendOp, *Mul; 1436 if (MergeIntoAddendOp) { 1437 AddendOp = II.getOperand(1); 1438 Mul = II.getOperand(2); 1439 } else { 1440 AddendOp = II.getOperand(2); 1441 Mul = II.getOperand(1); 1442 } 1443 1444 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0), 1445 m_Value(MulOp1)))) 1446 return std::nullopt; 1447 1448 if (!Mul->hasOneUse()) 1449 return std::nullopt; 1450 1451 Instruction *FMFSource = nullptr; 1452 if (II.getType()->isFPOrFPVectorTy()) { 1453 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 1454 // Stop the combine when the flags on the inputs differ in case dropping 1455 // flags would lead to us missing out on more beneficial optimizations. 
1456 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags()) 1457 return std::nullopt; 1458 if (!FAddFlags.allowContract()) 1459 return std::nullopt; 1460 FMFSource = &II; 1461 } 1462 1463 CallInst *Res; 1464 if (MergeIntoAddendOp) 1465 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()}, 1466 {P, AddendOp, MulOp0, MulOp1}, FMFSource); 1467 else 1468 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()}, 1469 {P, MulOp0, MulOp1, AddendOp}, FMFSource); 1470 1471 return IC.replaceInstUsesWith(II, Res); 1472 } 1473 1474 static std::optional<Instruction *> 1475 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1476 Value *Pred = II.getOperand(0); 1477 Value *PtrOp = II.getOperand(1); 1478 Type *VecTy = II.getType(); 1479 1480 // Replace by zero constant when all lanes are inactive 1481 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) 1482 return II_NA; 1483 1484 if (isAllActivePredicate(Pred)) { 1485 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp); 1486 Load->copyMetadata(II); 1487 return IC.replaceInstUsesWith(II, Load); 1488 } 1489 1490 CallInst *MaskedLoad = 1491 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL), 1492 Pred, ConstantAggregateZero::get(VecTy)); 1493 MaskedLoad->copyMetadata(II); 1494 return IC.replaceInstUsesWith(II, MaskedLoad); 1495 } 1496 1497 static std::optional<Instruction *> 1498 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1499 Value *VecOp = II.getOperand(0); 1500 Value *Pred = II.getOperand(1); 1501 Value *PtrOp = II.getOperand(2); 1502 1503 if (isAllActivePredicate(Pred)) { 1504 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp); 1505 Store->copyMetadata(II); 1506 return IC.eraseInstFromFunction(II); 1507 } 1508 1509 CallInst *MaskedStore = IC.Builder.CreateMaskedStore( 1510 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred); 1511 MaskedStore->copyMetadata(II); 1512 return IC.eraseInstFromFunction(II); 1513 } 1514 1515 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 1516 switch (Intrinsic) { 1517 case Intrinsic::aarch64_sve_fmul_u: 1518 return Instruction::BinaryOps::FMul; 1519 case Intrinsic::aarch64_sve_fadd_u: 1520 return Instruction::BinaryOps::FAdd; 1521 case Intrinsic::aarch64_sve_fsub_u: 1522 return Instruction::BinaryOps::FSub; 1523 default: 1524 return Instruction::BinaryOpsEnd; 1525 } 1526 } 1527 1528 static std::optional<Instruction *> 1529 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { 1530 // Bail due to missing support for ISD::STRICT_ scalable vector operations. 1531 if (II.isStrictFP()) 1532 return std::nullopt; 1533 1534 auto *OpPredicate = II.getOperand(0); 1535 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 1536 if (BinOpCode == Instruction::BinaryOpsEnd || 1537 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1538 m_ConstantInt<AArch64SVEPredPattern::all>()))) 1539 return std::nullopt; 1540 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder); 1541 IC.Builder.setFastMathFlags(II.getFastMathFlags()); 1542 auto BinOp = 1543 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 1544 return IC.replaceInstUsesWith(II, BinOp); 1545 } 1546 1547 // Canonicalise operations that take an all active predicate (e.g. sve.add -> 1548 // sve.add_u). 
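// For example, aarch64.sve.add(ptrue(all), %a, %b) is rewritten in place to
// aarch64.sve.add.u with the same operands; the _u form makes explicit that
// the inactive lanes are undefined, which later combines can exploit.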
1549 static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II, 1550 Intrinsic::ID IID) { 1551 auto *OpPredicate = II.getOperand(0); 1552 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1553 m_ConstantInt<AArch64SVEPredPattern::all>()))) 1554 return std::nullopt; 1555 1556 auto *Mod = II.getModule(); 1557 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()}); 1558 II.setCalledFunction(NewDecl); 1559 1560 return &II; 1561 } 1562 1563 // Simplify operations where predicate has all inactive lanes or try to replace 1564 // with _u form when all lanes are active 1565 static std::optional<Instruction *> 1566 instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, 1567 Intrinsic::ID IID) { 1568 if (match(II.getOperand(0), m_ZeroInt())) { 1569 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are 1570 // inactive for sv[func]_m 1571 return IC.replaceInstUsesWith(II, II.getOperand(1)); 1572 } 1573 return instCombineSVEAllActive(II, IID); 1574 } 1575 1576 static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, 1577 IntrinsicInst &II) { 1578 if (auto II_U = 1579 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u)) 1580 return II_U; 1581 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1582 Intrinsic::aarch64_sve_mla>( 1583 IC, II, true)) 1584 return MLA; 1585 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1586 Intrinsic::aarch64_sve_mad>( 1587 IC, II, false)) 1588 return MAD; 1589 return std::nullopt; 1590 } 1591 1592 static std::optional<Instruction *> 1593 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { 1594 if (auto II_U = 1595 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u)) 1596 return II_U; 1597 if (auto FMLA = 1598 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1599 Intrinsic::aarch64_sve_fmla>(IC, II, 1600 true)) 1601 return FMLA; 1602 if (auto FMAD = 1603 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1604 Intrinsic::aarch64_sve_fmad>(IC, II, 1605 false)) 1606 return FMAD; 1607 if (auto FMLA = 1608 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1609 Intrinsic::aarch64_sve_fmla>(IC, II, 1610 true)) 1611 return FMLA; 1612 return std::nullopt; 1613 } 1614 1615 static std::optional<Instruction *> 1616 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { 1617 if (auto FMLA = 1618 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1619 Intrinsic::aarch64_sve_fmla>(IC, II, 1620 true)) 1621 return FMLA; 1622 if (auto FMAD = 1623 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1624 Intrinsic::aarch64_sve_fmad>(IC, II, 1625 false)) 1626 return FMAD; 1627 if (auto FMLA_U = 1628 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1629 Intrinsic::aarch64_sve_fmla_u>( 1630 IC, II, true)) 1631 return FMLA_U; 1632 return instCombineSVEVectorBinOp(IC, II); 1633 } 1634 1635 static std::optional<Instruction *> 1636 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { 1637 if (auto II_U = 1638 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u)) 1639 return II_U; 1640 if (auto FMLS = 1641 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1642 Intrinsic::aarch64_sve_fmls>(IC, II, 1643 true)) 1644 return FMLS; 1645 if (auto FMSB = 1646 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1647 Intrinsic::aarch64_sve_fnmsb>( 1648 IC, II, false)) 1649 return 
FMSB; 1650 if (auto FMLS = 1651 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1652 Intrinsic::aarch64_sve_fmls>(IC, II, 1653 true)) 1654 return FMLS; 1655 return std::nullopt; 1656 } 1657 1658 static std::optional<Instruction *> 1659 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { 1660 if (auto FMLS = 1661 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1662 Intrinsic::aarch64_sve_fmls>(IC, II, 1663 true)) 1664 return FMLS; 1665 if (auto FMSB = 1666 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1667 Intrinsic::aarch64_sve_fnmsb>( 1668 IC, II, false)) 1669 return FMSB; 1670 if (auto FMLS_U = 1671 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1672 Intrinsic::aarch64_sve_fmls_u>( 1673 IC, II, true)) 1674 return FMLS_U; 1675 return instCombineSVEVectorBinOp(IC, II); 1676 } 1677 1678 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, 1679 IntrinsicInst &II) { 1680 if (auto II_U = 1681 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u)) 1682 return II_U; 1683 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1684 Intrinsic::aarch64_sve_mls>( 1685 IC, II, true)) 1686 return MLS; 1687 return std::nullopt; 1688 } 1689 1690 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 1691 IntrinsicInst &II, 1692 Intrinsic::ID IID) { 1693 auto *OpPredicate = II.getOperand(0); 1694 auto *OpMultiplicand = II.getOperand(1); 1695 auto *OpMultiplier = II.getOperand(2); 1696 1697 // Return true if a given instruction is a unit splat value, false otherwise. 1698 auto IsUnitSplat = [](auto *I) { 1699 auto *SplatValue = getSplatValue(I); 1700 if (!SplatValue) 1701 return false; 1702 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1703 }; 1704 1705 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1706 // with a unit splat value, false otherwise. 1707 auto IsUnitDup = [](auto *I) { 1708 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1709 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1710 return false; 1711 1712 auto *SplatValue = IntrI->getOperand(2); 1713 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1714 }; 1715 1716 if (IsUnitSplat(OpMultiplier)) { 1717 // [f]mul pg %n, (dupx 1) => %n 1718 OpMultiplicand->takeName(&II); 1719 return IC.replaceInstUsesWith(II, OpMultiplicand); 1720 } else if (IsUnitDup(OpMultiplier)) { 1721 // [f]mul pg %n, (dup pg 1) => %n 1722 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1723 auto *DupPg = DupInst->getOperand(1); 1724 // TODO: this is naive. The optimization is still valid if DupPg 1725 // 'encompasses' OpPredicate, not only if they're the same predicate. 
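    // (For instance, an all-active DupPg trivially encompasses any
    // OpPredicate, which could be caught with isAllActivePredicate.)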
1726 if (OpPredicate == DupPg) { 1727 OpMultiplicand->takeName(&II); 1728 return IC.replaceInstUsesWith(II, OpMultiplicand); 1729 } 1730 } 1731 1732 return instCombineSVEVectorBinOp(IC, II); 1733 } 1734 1735 static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1736 IntrinsicInst &II) { 1737 Value *UnpackArg = II.getArgOperand(0); 1738 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1739 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1740 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1741 1742 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1743 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1744 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1745 ScalarArg = 1746 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1747 Value *NewVal = 1748 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1749 NewVal->takeName(&II); 1750 return IC.replaceInstUsesWith(II, NewVal); 1751 } 1752 1753 return std::nullopt; 1754 } 1755 static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1756 IntrinsicInst &II) { 1757 auto *OpVal = II.getOperand(0); 1758 auto *OpIndices = II.getOperand(1); 1759 VectorType *VTy = cast<VectorType>(II.getType()); 1760 1761 // Check whether OpIndices is a constant splat value < minimal element count 1762 // of result. 1763 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1764 if (!SplatValue || 1765 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1766 return std::nullopt; 1767 1768 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1769 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1770 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue); 1771 auto *VectorSplat = 1772 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1773 1774 VectorSplat->takeName(&II); 1775 return IC.replaceInstUsesWith(II, VectorSplat); 1776 } 1777 1778 static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, 1779 IntrinsicInst &II) { 1780 Value *A, *B; 1781 Type *RetTy = II.getType(); 1782 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; 1783 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; 1784 1785 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> 1786 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> 1787 if ((match(II.getArgOperand(0), 1788 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) && 1789 match(II.getArgOperand(1), 1790 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) || 1791 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) && 1792 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) { 1793 auto *TyA = cast<ScalableVectorType>(A->getType()); 1794 if (TyA == B->getType() && 1795 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) { 1796 auto *SubVec = IC.Builder.CreateInsertVector( 1797 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0)); 1798 auto *ConcatVec = IC.Builder.CreateInsertVector( 1799 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements())); 1800 ConcatVec->takeName(&II); 1801 return IC.replaceInstUsesWith(II, ConcatVec); 1802 } 1803 } 1804 1805 return std::nullopt; 1806 } 1807 1808 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1809 IntrinsicInst &II) { 1810 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1811 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1812 Value *A, *B; 1813 if 
(match(II.getArgOperand(0), 1814 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1815 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1816 m_Specific(A), m_Specific(B)))) 1817 return IC.replaceInstUsesWith( 1818 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1819 1820 return std::nullopt; 1821 } 1822 1823 static std::optional<Instruction *> 1824 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { 1825 Value *Mask = II.getOperand(0); 1826 Value *BasePtr = II.getOperand(1); 1827 Value *Index = II.getOperand(2); 1828 Type *Ty = II.getType(); 1829 Value *PassThru = ConstantAggregateZero::get(Ty); 1830 1831 // Replace by zero constant when all lanes are inactive 1832 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) 1833 return II_NA; 1834 1835 // Contiguous gather => masked load. 1836 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1837 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1838 Value *IndexBase; 1839 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1840 m_Value(IndexBase), m_SpecificInt(1)))) { 1841 Align Alignment = 1842 BasePtr->getPointerAlignment(II.getDataLayout()); 1843 1844 Type *VecPtrTy = PointerType::getUnqual(Ty); 1845 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), 1846 BasePtr, IndexBase); 1847 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy); 1848 CallInst *MaskedLoad = 1849 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1850 MaskedLoad->takeName(&II); 1851 return IC.replaceInstUsesWith(II, MaskedLoad); 1852 } 1853 1854 return std::nullopt; 1855 } 1856 1857 static std::optional<Instruction *> 1858 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { 1859 Value *Val = II.getOperand(0); 1860 Value *Mask = II.getOperand(1); 1861 Value *BasePtr = II.getOperand(2); 1862 Value *Index = II.getOperand(3); 1863 Type *Ty = Val->getType(); 1864 1865 // Contiguous scatter => masked store. 
1866 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1867 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1868 Value *IndexBase; 1869 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1870 m_Value(IndexBase), m_SpecificInt(1)))) { 1871 Align Alignment = 1872 BasePtr->getPointerAlignment(II.getDataLayout()); 1873 1874 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), 1875 BasePtr, IndexBase); 1876 Type *VecPtrTy = PointerType::getUnqual(Ty); 1877 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy); 1878 1879 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1880 1881 return IC.eraseInstFromFunction(II); 1882 } 1883 1884 return std::nullopt; 1885 } 1886 1887 static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1888 IntrinsicInst &II) { 1889 Type *Int32Ty = IC.Builder.getInt32Ty(); 1890 Value *Pred = II.getOperand(0); 1891 Value *Vec = II.getOperand(1); 1892 Value *DivVec = II.getOperand(2); 1893 1894 Value *SplatValue = getSplatValue(DivVec); 1895 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1896 if (!SplatConstantInt) 1897 return std::nullopt; 1898 APInt Divisor = SplatConstantInt->getValue(); 1899 1900 if (Divisor.isPowerOf2()) { 1901 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1902 auto ASRD = IC.Builder.CreateIntrinsic( 1903 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1904 return IC.replaceInstUsesWith(II, ASRD); 1905 } 1906 if (Divisor.isNegatedPowerOf2()) { 1907 Divisor.negate(); 1908 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1909 auto ASRD = IC.Builder.CreateIntrinsic( 1910 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1911 auto NEG = IC.Builder.CreateIntrinsic( 1912 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD}); 1913 return IC.replaceInstUsesWith(II, NEG); 1914 } 1915 1916 return std::nullopt; 1917 } 1918 1919 bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { 1920 size_t VecSize = Vec.size(); 1921 if (VecSize == 1) 1922 return true; 1923 if (!isPowerOf2_64(VecSize)) 1924 return false; 1925 size_t HalfVecSize = VecSize / 2; 1926 1927 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; 1928 RHS != Vec.end(); LHS++, RHS++) { 1929 if (*LHS != nullptr && *RHS != nullptr) { 1930 if (*LHS == *RHS) 1931 continue; 1932 else 1933 return false; 1934 } 1935 if (!AllowPoison) 1936 return false; 1937 if (*LHS == nullptr && *RHS != nullptr) 1938 *LHS = *RHS; 1939 } 1940 1941 Vec.resize(HalfVecSize); 1942 SimplifyValuePattern(Vec, AllowPoison); 1943 return true; 1944 } 1945 1946 // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) 1947 // to dupqlane(f64(C)) where C is A concatenated with B 1948 static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, 1949 IntrinsicInst &II) { 1950 Value *CurrentInsertElt = nullptr, *Default = nullptr; 1951 if (!match(II.getOperand(0), 1952 m_Intrinsic<Intrinsic::vector_insert>( 1953 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || 1954 !isa<FixedVectorType>(CurrentInsertElt->getType())) 1955 return std::nullopt; 1956 auto IIScalableTy = cast<ScalableVectorType>(II.getType()); 1957 1958 // Insert the scalars into a container ordered by InsertElement index 1959 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); 1960 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { 1961 auto Idx = 
cast<ConstantInt>(InsertElt->getOperand(2)); 1962 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); 1963 CurrentInsertElt = InsertElt->getOperand(0); 1964 } 1965 1966 bool AllowPoison = 1967 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); 1968 if (!SimplifyValuePattern(Elts, AllowPoison)) 1969 return std::nullopt; 1970 1971 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) 1972 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); 1973 for (size_t I = 0; I < Elts.size(); I++) { 1974 if (Elts[I] == nullptr) 1975 continue; 1976 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I], 1977 IC.Builder.getInt64(I)); 1978 } 1979 if (InsertEltChain == nullptr) 1980 return std::nullopt; 1981 1982 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 1983 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector 1984 // be bitcast to a type wide enough to fit the sequence, be splatted, and then 1985 // be narrowed back to the original type. 1986 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); 1987 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * 1988 IIScalableTy->getMinNumElements() / 1989 PatternWidth; 1990 1991 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth); 1992 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); 1993 auto *WideShuffleMaskTy = 1994 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount); 1995 1996 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0)); 1997 auto InsertSubvector = IC.Builder.CreateInsertVector( 1998 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx); 1999 auto WideBitcast = 2000 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); 2001 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); 2002 auto WideShuffle = IC.Builder.CreateShuffleVector( 2003 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); 2004 auto NarrowBitcast = 2005 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); 2006 2007 return IC.replaceInstUsesWith(II, NarrowBitcast); 2008 } 2009 2010 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 2011 IntrinsicInst &II) { 2012 Value *A = II.getArgOperand(0); 2013 Value *B = II.getArgOperand(1); 2014 if (A == B) 2015 return IC.replaceInstUsesWith(II, A); 2016 2017 return std::nullopt; 2018 } 2019 2020 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 2021 IntrinsicInst &II) { 2022 Value *Pred = II.getOperand(0); 2023 Value *Vec = II.getOperand(1); 2024 Value *Shift = II.getOperand(2); 2025 2026 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 
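  // SRSHL only differs from LSL for negative shift amounts, where it performs
  // a rounding right shift, so the checks below ensure the shift amount is
  // known to be non-negative before rewriting.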
2027 Value *AbsPred, *MergedValue; 2028 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 2029 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 2030 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 2031 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 2032 2033 return std::nullopt; 2034 2035 // Transform is valid if any of the following are true: 2036 // * The ABS merge value is an undef or non-negative 2037 // * The ABS predicate is all active 2038 // * The ABS predicate and the SRSHL predicates are the same 2039 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && 2040 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 2041 return std::nullopt; 2042 2043 // Only valid when the shift amount is non-negative, otherwise the rounding 2044 // behaviour of SRSHL cannot be ignored. 2045 if (!match(Shift, m_NonNegative())) 2046 return std::nullopt; 2047 2048 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, 2049 {II.getType()}, {Pred, Vec, Shift}); 2050 2051 return IC.replaceInstUsesWith(II, LSL); 2052 } 2053 2054 std::optional<Instruction *> 2055 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 2056 IntrinsicInst &II) const { 2057 Intrinsic::ID IID = II.getIntrinsicID(); 2058 switch (IID) { 2059 default: 2060 break; 2061 2062 case Intrinsic::aarch64_sve_st1_scatter: 2063 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 2064 case Intrinsic::aarch64_sve_st1_scatter_sxtw: 2065 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 2066 case Intrinsic::aarch64_sve_st1_scatter_uxtw: 2067 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 2068 case Intrinsic::aarch64_sve_st1dq: 2069 case Intrinsic::aarch64_sve_st1q_scatter_index: 2070 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: 2071 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: 2072 case Intrinsic::aarch64_sve_st1wq: 2073 case Intrinsic::aarch64_sve_stnt1: 2074 case Intrinsic::aarch64_sve_stnt1_scatter: 2075 case Intrinsic::aarch64_sve_stnt1_scatter_index: 2076 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 2077 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 2078 return instCombineSVENoActiveUnaryErase(IC, II, 1); 2079 case Intrinsic::aarch64_sve_st2: 2080 case Intrinsic::aarch64_sve_st2q: 2081 return instCombineSVENoActiveUnaryErase(IC, II, 2); 2082 case Intrinsic::aarch64_sve_st3: 2083 case Intrinsic::aarch64_sve_st3q: 2084 return instCombineSVENoActiveUnaryErase(IC, II, 3); 2085 case Intrinsic::aarch64_sve_st4: 2086 case Intrinsic::aarch64_sve_st4q: 2087 return instCombineSVENoActiveUnaryErase(IC, II, 4); 2088 case Intrinsic::aarch64_sve_ld1_gather: 2089 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 2090 case Intrinsic::aarch64_sve_ld1_gather_sxtw: 2091 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 2092 case Intrinsic::aarch64_sve_ld1_gather_uxtw: 2093 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 2094 case Intrinsic::aarch64_sve_ld1q_gather_index: 2095 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: 2096 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: 2097 case Intrinsic::aarch64_sve_ld1ro: 2098 case Intrinsic::aarch64_sve_ld1rq: 2099 case Intrinsic::aarch64_sve_ld1udq: 2100 case Intrinsic::aarch64_sve_ld1uwq: 2101 case Intrinsic::aarch64_sve_ld2_sret: 2102 case Intrinsic::aarch64_sve_ld2q_sret: 2103 case Intrinsic::aarch64_sve_ld3_sret: 2104 case Intrinsic::aarch64_sve_ld3q_sret: 2105 case Intrinsic::aarch64_sve_ld4_sret: 2106 case Intrinsic::aarch64_sve_ld4q_sret: 2107 case Intrinsic::aarch64_sve_ldff1: 2108 
case Intrinsic::aarch64_sve_ldff1_gather: 2109 case Intrinsic::aarch64_sve_ldff1_gather_index: 2110 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 2111 case Intrinsic::aarch64_sve_ldff1_gather_sxtw: 2112 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 2113 case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 2114 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 2115 case Intrinsic::aarch64_sve_ldnf1: 2116 case Intrinsic::aarch64_sve_ldnt1: 2117 case Intrinsic::aarch64_sve_ldnt1_gather: 2118 case Intrinsic::aarch64_sve_ldnt1_gather_index: 2119 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 2120 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 2121 return instCombineSVENoActiveUnaryZero(IC, II); 2122 case Intrinsic::aarch64_neon_fmaxnm: 2123 case Intrinsic::aarch64_neon_fminnm: 2124 return instCombineMaxMinNM(IC, II); 2125 case Intrinsic::aarch64_sve_convert_from_svbool: 2126 return instCombineConvertFromSVBool(IC, II); 2127 case Intrinsic::aarch64_sve_dup: 2128 return instCombineSVEDup(IC, II); 2129 case Intrinsic::aarch64_sve_dup_x: 2130 return instCombineSVEDupX(IC, II); 2131 case Intrinsic::aarch64_sve_cmpne: 2132 case Intrinsic::aarch64_sve_cmpne_wide: 2133 return instCombineSVECmpNE(IC, II); 2134 case Intrinsic::aarch64_sve_rdffr: 2135 return instCombineRDFFR(IC, II); 2136 case Intrinsic::aarch64_sve_lasta: 2137 case Intrinsic::aarch64_sve_lastb: 2138 return instCombineSVELast(IC, II); 2139 case Intrinsic::aarch64_sve_clasta_n: 2140 case Intrinsic::aarch64_sve_clastb_n: 2141 return instCombineSVECondLast(IC, II); 2142 case Intrinsic::aarch64_sve_cntd: 2143 return instCombineSVECntElts(IC, II, 2); 2144 case Intrinsic::aarch64_sve_cntw: 2145 return instCombineSVECntElts(IC, II, 4); 2146 case Intrinsic::aarch64_sve_cnth: 2147 return instCombineSVECntElts(IC, II, 8); 2148 case Intrinsic::aarch64_sve_cntb: 2149 return instCombineSVECntElts(IC, II, 16); 2150 case Intrinsic::aarch64_sve_ptest_any: 2151 case Intrinsic::aarch64_sve_ptest_first: 2152 case Intrinsic::aarch64_sve_ptest_last: 2153 return instCombineSVEPTest(IC, II); 2154 case Intrinsic::aarch64_sve_fabd: 2155 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u); 2156 case Intrinsic::aarch64_sve_fadd: 2157 return instCombineSVEVectorFAdd(IC, II); 2158 case Intrinsic::aarch64_sve_fadd_u: 2159 return instCombineSVEVectorFAddU(IC, II); 2160 case Intrinsic::aarch64_sve_fdiv: 2161 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u); 2162 case Intrinsic::aarch64_sve_fmax: 2163 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u); 2164 case Intrinsic::aarch64_sve_fmaxnm: 2165 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u); 2166 case Intrinsic::aarch64_sve_fmin: 2167 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u); 2168 case Intrinsic::aarch64_sve_fminnm: 2169 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u); 2170 case Intrinsic::aarch64_sve_fmla: 2171 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u); 2172 case Intrinsic::aarch64_sve_fmls: 2173 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u); 2174 case Intrinsic::aarch64_sve_fmul: 2175 if (auto II_U = 2176 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u)) 2177 return II_U; 2178 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); 2179 case Intrinsic::aarch64_sve_fmul_u: 2180 return instCombineSVEVectorMul(IC, II, 
Intrinsic::aarch64_sve_fmul_u); 2181 case Intrinsic::aarch64_sve_fmulx: 2182 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u); 2183 case Intrinsic::aarch64_sve_fnmla: 2184 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u); 2185 case Intrinsic::aarch64_sve_fnmls: 2186 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u); 2187 case Intrinsic::aarch64_sve_fsub: 2188 return instCombineSVEVectorFSub(IC, II); 2189 case Intrinsic::aarch64_sve_fsub_u: 2190 return instCombineSVEVectorFSubU(IC, II); 2191 case Intrinsic::aarch64_sve_add: 2192 return instCombineSVEVectorAdd(IC, II); 2193 case Intrinsic::aarch64_sve_add_u: 2194 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, 2195 Intrinsic::aarch64_sve_mla_u>( 2196 IC, II, true); 2197 case Intrinsic::aarch64_sve_mla: 2198 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u); 2199 case Intrinsic::aarch64_sve_mls: 2200 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u); 2201 case Intrinsic::aarch64_sve_mul: 2202 if (auto II_U = 2203 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u)) 2204 return II_U; 2205 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); 2206 case Intrinsic::aarch64_sve_mul_u: 2207 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); 2208 case Intrinsic::aarch64_sve_sabd: 2209 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u); 2210 case Intrinsic::aarch64_sve_smax: 2211 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u); 2212 case Intrinsic::aarch64_sve_smin: 2213 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u); 2214 case Intrinsic::aarch64_sve_smulh: 2215 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u); 2216 case Intrinsic::aarch64_sve_sub: 2217 return instCombineSVEVectorSub(IC, II); 2218 case Intrinsic::aarch64_sve_sub_u: 2219 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, 2220 Intrinsic::aarch64_sve_mls_u>( 2221 IC, II, true); 2222 case Intrinsic::aarch64_sve_uabd: 2223 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u); 2224 case Intrinsic::aarch64_sve_umax: 2225 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u); 2226 case Intrinsic::aarch64_sve_umin: 2227 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u); 2228 case Intrinsic::aarch64_sve_umulh: 2229 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u); 2230 case Intrinsic::aarch64_sve_asr: 2231 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u); 2232 case Intrinsic::aarch64_sve_lsl: 2233 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u); 2234 case Intrinsic::aarch64_sve_lsr: 2235 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u); 2236 case Intrinsic::aarch64_sve_and: 2237 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u); 2238 case Intrinsic::aarch64_sve_bic: 2239 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u); 2240 case Intrinsic::aarch64_sve_eor: 2241 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u); 2242 case Intrinsic::aarch64_sve_orr: 2243 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u); 2244 case Intrinsic::aarch64_sve_sqsub: 2245 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u); 2246 
case Intrinsic::aarch64_sve_uqsub: 2247 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u); 2248 case Intrinsic::aarch64_sve_tbl: 2249 return instCombineSVETBL(IC, II); 2250 case Intrinsic::aarch64_sve_uunpkhi: 2251 case Intrinsic::aarch64_sve_uunpklo: 2252 case Intrinsic::aarch64_sve_sunpkhi: 2253 case Intrinsic::aarch64_sve_sunpklo: 2254 return instCombineSVEUnpack(IC, II); 2255 case Intrinsic::aarch64_sve_uzp1: 2256 return instCombineSVEUzp1(IC, II); 2257 case Intrinsic::aarch64_sve_zip1: 2258 case Intrinsic::aarch64_sve_zip2: 2259 return instCombineSVEZip(IC, II); 2260 case Intrinsic::aarch64_sve_ld1_gather_index: 2261 return instCombineLD1GatherIndex(IC, II); 2262 case Intrinsic::aarch64_sve_st1_scatter_index: 2263 return instCombineST1ScatterIndex(IC, II); 2264 case Intrinsic::aarch64_sve_ld1: 2265 return instCombineSVELD1(IC, II, DL); 2266 case Intrinsic::aarch64_sve_st1: 2267 return instCombineSVEST1(IC, II, DL); 2268 case Intrinsic::aarch64_sve_sdiv: 2269 return instCombineSVESDIV(IC, II); 2270 case Intrinsic::aarch64_sve_sel: 2271 return instCombineSVESel(IC, II); 2272 case Intrinsic::aarch64_sve_srshl: 2273 return instCombineSVESrshl(IC, II); 2274 case Intrinsic::aarch64_sve_dupq_lane: 2275 return instCombineSVEDupqLane(IC, II); 2276 } 2277 2278 return std::nullopt; 2279 } 2280 2281 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 2282 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 2283 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 2284 std::function<void(Instruction *, unsigned, APInt, APInt &)> 2285 SimplifyAndSetOp) const { 2286 switch (II.getIntrinsicID()) { 2287 default: 2288 break; 2289 case Intrinsic::aarch64_neon_fcvtxn: 2290 case Intrinsic::aarch64_neon_rshrn: 2291 case Intrinsic::aarch64_neon_sqrshrn: 2292 case Intrinsic::aarch64_neon_sqrshrun: 2293 case Intrinsic::aarch64_neon_sqshrn: 2294 case Intrinsic::aarch64_neon_sqshrun: 2295 case Intrinsic::aarch64_neon_sqxtn: 2296 case Intrinsic::aarch64_neon_sqxtun: 2297 case Intrinsic::aarch64_neon_uqrshrn: 2298 case Intrinsic::aarch64_neon_uqshrn: 2299 case Intrinsic::aarch64_neon_uqxtn: 2300 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 2301 break; 2302 } 2303 2304 return std::nullopt; 2305 } 2306 2307 bool AArch64TTIImpl::enableScalableVectorization() const { 2308 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && 2309 EnableScalableAutovecInStreamingMode); 2310 } 2311 2312 TypeSize 2313 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 2314 switch (K) { 2315 case TargetTransformInfo::RGK_Scalar: 2316 return TypeSize::getFixed(64); 2317 case TargetTransformInfo::RGK_FixedWidthVector: 2318 if (ST->useSVEForFixedLengthVectors() && 2319 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) 2320 return TypeSize::getFixed( 2321 std::max(ST->getMinSVEVectorSizeInBits(), 128u)); 2322 else if (ST->isNeonAvailable()) 2323 return TypeSize::getFixed(128); 2324 else 2325 return TypeSize::getFixed(0); 2326 case TargetTransformInfo::RGK_ScalableVector: 2327 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && 2328 EnableScalableAutovecInStreamingMode)) 2329 return TypeSize::getScalable(128); 2330 else 2331 return TypeSize::getScalable(0); 2332 } 2333 llvm_unreachable("Unsupported register kind"); 2334 } 2335 2336 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 2337 ArrayRef<const Value *> Args, 2338 Type *SrcOverrideTy) { 2339 // A helper that returns a 
vector type from the given type. The number of
  // elements in the destination type DstTy determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           cast<VectorType>(DstTy)->getElementCount());
  };

  // Exit early if DstTy is not a vector type whose elements are one of [i16,
  // i32, i64]. SVE doesn't generally have the same set of instructions to
  // perform an extend with the add/sub/mul. There are SMULLB style
  // instructions, but they operate on top/bottom, requiring some sort of lane
  // interleaving to be used with zext/sext.
  unsigned DstEltSize = DstTy->getScalarSizeInBits();
  if (!useNeonVector(DstTy) || Args.size() != 2 ||
      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., shl, etc.) once we
  // verify that their extending operands are eliminated during code
  // generation.
  Type *SrcTy = SrcOverrideTy;
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    // The second operand needs to be an extend.
    if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
    } else
      return false;
    break;
  case Instruction::Mul: { // SMULL(2), UMULL(2)
    // Both operands need to be extends of the same type.
    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
      // If one of the operands is a Zext and the other has enough zero bits
      // to be treated as unsigned, we can still generate a umull, meaning the
      // zext is free.
      KnownBits Known =
          computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
      if (Args[0]->getType()->getScalarSizeInBits() -
              Known.Zero.countLeadingOnes() >
          DstTy->getScalarSizeInBits() / 2)
        return false;
      if (!SrcTy)
        SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
                                           DstTy->getScalarSizeInBits() / 2));
    } else
      return false;
    break;
  }
  default:
    return false;
  }

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = getTypeLegalizationCost(DstTy);
  if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  assert(SrcTy && "Expected some SrcTy");
  auto SrcTyL = getTypeLegalizationCost(SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
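  // For example, add(v8i16, sext(v8i8)) legalizes to a v8i16 destination and
  // a v8i8 source, both with eight lanes, and matches the SADDW form, so the
  // sign-extend is considered free.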
2418 InstructionCost NumDstEls = 2419 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 2420 InstructionCost NumSrcEls = 2421 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 2422 2423 // Return true if the legalized types have the same number of vector elements 2424 // and the destination element type size is twice that of the source type. 2425 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; 2426 } 2427 2428 // s/urhadd instructions implement the following pattern, making the 2429 // extends free: 2430 // %x = add ((zext i8 -> i16), 1) 2431 // %y = (zext i8 -> i16) 2432 // trunc i16 (lshr (add %x, %y), 1) -> i8 2433 // 2434 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, 2435 Type *Src) { 2436 // The source should be a legal vector type. 2437 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) || 2438 (Src->isScalableTy() && !ST->hasSVE2())) 2439 return false; 2440 2441 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) 2442 return false; 2443 2444 // Look for trunc/shl/add before trying to match the pattern. 2445 const Instruction *Add = ExtUser; 2446 auto *AddUser = 2447 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); 2448 if (AddUser && AddUser->getOpcode() == Instruction::Add) 2449 Add = AddUser; 2450 2451 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); 2452 if (!Shr || Shr->getOpcode() != Instruction::LShr) 2453 return false; 2454 2455 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser()); 2456 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || 2457 Src->getScalarSizeInBits() != 2458 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits()) 2459 return false; 2460 2461 // Try to match the whole pattern. Ext could be either the first or second 2462 // m_ZExtOrSExt matched. 2463 Instruction *Ex1, *Ex2; 2464 if (!(match(Add, m_c_Add(m_Instruction(Ex1), 2465 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1)))))) 2466 return false; 2467 2468 // Ensure both extends are of the same type 2469 if (match(Ex1, m_ZExtOrSExt(m_Value())) && 2470 Ex1->getOpcode() == Ex2->getOpcode()) 2471 return true; 2472 2473 return false; 2474 } 2475 2476 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 2477 Type *Src, 2478 TTI::CastContextHint CCH, 2479 TTI::TargetCostKind CostKind, 2480 const Instruction *I) { 2481 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2482 assert(ISD && "Invalid opcode"); 2483 // If the cast is observable, and it is used by a widening instruction (e.g., 2484 // uaddl, saddw, etc.), it may be free. 2485 if (I && I->hasOneUser()) { 2486 auto *SingleUser = cast<Instruction>(*I->user_begin()); 2487 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 2488 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { 2489 // For adds only count the second operand as free if both operands are 2490 // extends but not the same operation. (i.e both operands are not free in 2491 // add(sext, zext)). 2492 if (SingleUser->getOpcode() == Instruction::Add) { 2493 if (I == SingleUser->getOperand(1) || 2494 (isa<CastInst>(SingleUser->getOperand(1)) && 2495 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) 2496 return 0; 2497 } else // Others are free so long as isWideningInstruction returned true. 
2498 return 0; 2499 } 2500 2501 // The cast will be free for the s/urhadd instructions 2502 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && 2503 isExtPartOfAvgExpr(SingleUser, Dst, Src)) 2504 return 0; 2505 } 2506 2507 // TODO: Allow non-throughput costs that aren't binary. 2508 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 2509 if (CostKind != TTI::TCK_RecipThroughput) 2510 return Cost == 0 ? 0 : 1; 2511 return Cost; 2512 }; 2513 2514 EVT SrcTy = TLI->getValueType(DL, Src); 2515 EVT DstTy = TLI->getValueType(DL, Dst); 2516 2517 if (!SrcTy.isSimple() || !DstTy.isSimple()) 2518 return AdjustCost( 2519 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 2520 2521 static const TypeConversionCostTblEntry 2522 ConversionTbl[] = { 2523 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn 2524 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn 2525 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn 2526 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn 2527 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 2528 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn 2529 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn 2530 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 2531 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn 2532 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn 2533 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn 2534 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 2535 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 2536 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 2537 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 2538 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 2539 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 2540 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 2541 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 2542 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 2543 2544 // Truncations on nxvmiN 2545 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 2546 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 2547 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 2548 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 2549 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 2550 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 2551 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 2552 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 2553 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 2554 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 2555 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 2556 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 2557 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 2558 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 2559 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 2560 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 2561 2562 // The number of shll instructions for the extension. 
2563 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 2564 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 2565 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 2566 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 2567 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 2568 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 2569 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 2570 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 2571 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 2572 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 2573 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 2574 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 2575 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 2576 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 2577 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 2578 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 2579 2580 // LowerVectorINT_TO_FP: 2581 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 2582 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 2583 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 2584 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 2585 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 2586 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 2587 2588 // Complex: to v2f32 2589 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 2590 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 2591 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 2592 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 2593 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 2594 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 2595 2596 // Complex: to v4f32 2597 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 2598 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 2599 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 2600 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 2601 2602 // Complex: to v8f32 2603 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 2604 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 2605 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 2606 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 2607 2608 // Complex: to v16f32 2609 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 2610 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 2611 2612 // Complex: to v2f64 2613 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 2614 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 2615 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 2616 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 2617 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 2618 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 2619 2620 // Complex: to v4f64 2621 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, 2622 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, 2623 2624 // LowerVectorFP_TO_INT 2625 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 2626 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 2627 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 2628 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 2629 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 2630 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 2631 2632 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 
2633 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 2634 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 2635 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 2636 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 2637 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 2638 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 2639 2640 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 2641 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 2642 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 2643 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 2644 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 2645 2646 // Complex, from nxv2f32. 2647 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 2648 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 2649 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 2650 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 2651 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 2652 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 2653 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 2654 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 2655 2656 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 2657 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 2658 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 2659 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 2660 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 2661 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 2662 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 2663 2664 // Complex, from nxv2f64. 2665 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 2666 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 2667 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 2668 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 2669 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 2670 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 2671 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 2672 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 2673 2674 // Complex, from nxv4f32. 2675 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 2676 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 2677 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 2678 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 2679 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 2680 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 2681 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 2682 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 2683 2684 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 2685 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 2686 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 2687 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 2688 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 2689 2690 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 2691 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 2692 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 2693 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 2694 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 2695 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 2696 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 2697 2698 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 2699 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 2700 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 2701 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 2702 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 2703 2704 // Complex, from nxv8f16. 
2705 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 2706 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 2707 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 2708 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 2709 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 2710 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 2711 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 2712 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 2713 2714 // Complex, from nxv4f16. 2715 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 2716 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 2717 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 2718 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 2719 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 2720 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 2721 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 2722 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 2723 2724 // Complex, from nxv2f16. 2725 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 2726 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 2727 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 2728 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 2729 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 2730 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 2731 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 2732 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 2733 2734 // Truncate from nxvmf32 to nxvmf16. 2735 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 2736 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 2737 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 2738 2739 // Truncate from nxvmf64 to nxvmf16. 2740 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 2741 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 2742 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 2743 2744 // Truncate from nxvmf64 to nxvmf32. 2745 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 2746 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 2747 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 2748 2749 // Extend from nxvmf16 to nxvmf32. 2750 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 2751 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 2752 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 2753 2754 // Extend from nxvmf16 to nxvmf64. 2755 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 2756 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 2757 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 2758 2759 // Extend from nxvmf32 to nxvmf64. 2760 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 2761 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 2762 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 2763 2764 // Bitcasts from float to integer 2765 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 2766 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 2767 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 2768 2769 // Bitcasts from integer to float 2770 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 2771 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 2772 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 2773 2774 // Add cost for extending to illegal -too wide- scalable vectors. 2775 // zero/sign extend are implemented by multiple unpack operations, 2776 // where each operation has a cost of 1. 
2777 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, 2778 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, 2779 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, 2780 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, 2781 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, 2782 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, 2783 2784 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, 2785 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, 2786 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, 2787 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, 2788 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, 2789 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, 2790 }; 2791 2792 // We have to estimate a cost of fixed length operation upon 2793 // SVE registers(operations) with the number of registers required 2794 // for a fixed type to be represented upon SVE registers. 2795 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy; 2796 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && 2797 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && 2798 ST->useSVEForFixedLengthVectors(WiderTy)) { 2799 std::pair<InstructionCost, MVT> LT = 2800 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext())); 2801 unsigned NumElements = AArch64::SVEBitsPerBlock / 2802 LT.second.getScalarSizeInBits(); 2803 return AdjustCost( 2804 LT.first * 2805 getCastInstrCost( 2806 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements), 2807 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH, 2808 CostKind, I)); 2809 } 2810 2811 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 2812 DstTy.getSimpleVT(), 2813 SrcTy.getSimpleVT())) 2814 return AdjustCost(Entry->Cost); 2815 2816 static const TypeConversionCostTblEntry FP16Tbl[] = { 2817 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 2818 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 2819 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 2820 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 2821 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 2822 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 2823 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 2824 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 2825 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 2826 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 2827 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 2828 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 2829 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 2830 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 2831 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 2832 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 2833 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 2834 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 2835 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 2836 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 2837 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 2838 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 2839 }; 2840 2841 if (ST->hasFullFP16()) 2842 if (const auto *Entry = ConvertCostTableLookup( 2843 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 2844 return AdjustCost(Entry->Cost); 2845 2846 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && 2847 CCH == TTI::CastContextHint::Masked && 2848 ST->isSVEorStreamingSVEAvailable() && 2849 
TLI->getTypeAction(Src->getContext(), SrcTy) == 2850 TargetLowering::TypePromoteInteger && 2851 TLI->getTypeAction(Dst->getContext(), DstTy) == 2852 TargetLowering::TypeSplitVector) { 2853 // The standard behaviour in the backend for these cases is to split the 2854 // extend up into two parts: 2855 // 1. Perform an extending load or masked load up to the legal type. 2856 // 2. Extend the loaded data to the final type. 2857 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 2858 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext()); 2859 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( 2860 Opcode, LegalTy, Src, CCH, CostKind, I); 2861 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( 2862 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I); 2863 return Part1 + Part2; 2864 } 2865 2866 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, 2867 // but we also want to include the TTI::CastContextHint::Masked case too. 2868 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && 2869 CCH == TTI::CastContextHint::Masked && 2870 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy)) 2871 CCH = TTI::CastContextHint::Normal; 2872 2873 return AdjustCost( 2874 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 2875 } 2876 2877 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 2878 Type *Dst, 2879 VectorType *VecTy, 2880 unsigned Index) { 2881 2882 // Make sure we were given a valid extend opcode. 2883 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 2884 "Invalid opcode"); 2885 2886 // We are extending an element we extract from a vector, so the source type 2887 // of the extend is the element type of the vector. 2888 auto *Src = VecTy->getElementType(); 2889 2890 // Sign- and zero-extends are for integer types only. 2891 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 2892 2893 // Get the cost for the extract. We compute the cost (if any) for the extend 2894 // below. 2895 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2896 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, 2897 CostKind, Index, nullptr, nullptr); 2898 2899 // Legalize the types. 2900 auto VecLT = getTypeLegalizationCost(VecTy); 2901 auto DstVT = TLI->getValueType(DL, Dst); 2902 auto SrcVT = TLI->getValueType(DL, Src); 2903 2904 // If the resulting type is still a vector and the destination type is legal, 2905 // we may get the extension for free. If not, get the default cost for the 2906 // extend. 2907 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 2908 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2909 CostKind); 2910 2911 // The destination type should be larger than the element type. If not, get 2912 // the default cost for the extend. 2913 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 2914 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2915 CostKind); 2916 2917 switch (Opcode) { 2918 default: 2919 llvm_unreachable("Opcode should be either SExt or ZExt"); 2920 2921 // For sign-extends, we only need a smov, which performs the extension 2922 // automatically. 2923 case Instruction::SExt: 2924 return Cost; 2925 2926 // For zero-extends, the extend is performed automatically by a umov unless 2927 // the destination type is i64 and the element type is i8 or i16. 
2928 case Instruction::ZExt: 2929 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 2930 return Cost; 2931 } 2932 2933 // If we are unable to perform the extend for free, get the default cost. 2934 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2935 CostKind); 2936 } 2937 2938 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 2939 TTI::TargetCostKind CostKind, 2940 const Instruction *I) { 2941 if (CostKind != TTI::TCK_RecipThroughput) 2942 return Opcode == Instruction::PHI ? 0 : 1; 2943 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 2944 // Branches are assumed to be predicted. 2945 return 0; 2946 } 2947 2948 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, 2949 Type *Val, 2950 unsigned Index, 2951 bool HasRealUse) { 2952 assert(Val->isVectorTy() && "This must be a vector type"); 2953 2954 if (Index != -1U) { 2955 // Legalize the type. 2956 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 2957 2958 // This type is legalized to a scalar type. 2959 if (!LT.second.isVector()) 2960 return 0; 2961 2962 // The type may be split. For fixed-width vectors we can normalize the 2963 // index to the new type. 2964 if (LT.second.isFixedLengthVector()) { 2965 unsigned Width = LT.second.getVectorNumElements(); 2966 Index = Index % Width; 2967 } 2968 2969 // The element at index zero is already inside the vector. 2970 // - For a physical (HasRealUse==true) insert-element or extract-element 2971 // instruction that extracts integers, an explicit FPR -> GPR move is 2972 // needed. So it has non-zero cost. 2973 // - For the rest of cases (virtual instruction or element type is float), 2974 // consider the instruction free. 2975 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) 2976 return 0; 2977 2978 // This is recognising a LD1 single-element structure to one lane of one 2979 // register instruction. I.e., if this is an `insertelement` instruction, 2980 // and its second operand is a load, then we will generate a LD1, which 2981 // are expensive instructions. 2982 if (I && dyn_cast<LoadInst>(I->getOperand(1))) 2983 return ST->getVectorInsertExtractBaseCost() + 1; 2984 2985 // i1 inserts and extract will include an extra cset or cmp of the vector 2986 // value. Increase the cost by 1 to account. 2987 if (Val->getScalarSizeInBits() == 1) 2988 return ST->getVectorInsertExtractBaseCost() + 1; 2989 2990 // FIXME: 2991 // If the extract-element and insert-element instructions could be 2992 // simplified away (e.g., could be combined into users by looking at use-def 2993 // context), they have no cost. This is not done in the first place for 2994 // compile-time considerations. 2995 } 2996 2997 // All other insert/extracts cost this much. 
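// The base cost is a per-subtarget estimate of the GPR<->FPR transfer that a
// lane insert/extract typically requires; see
// AArch64Subtarget::getVectorInsertExtractBaseCost().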
2998 return ST->getVectorInsertExtractBaseCost();
2999 }
3000
3001 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3002 TTI::TargetCostKind CostKind,
3003 unsigned Index, Value *Op0,
3004 Value *Op1) {
3005 bool HasRealUse =
3006 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3007 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
3008 }
3009
3010 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3011 Type *Val,
3012 TTI::TargetCostKind CostKind,
3013 unsigned Index) {
3014 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
3015 }
3016
3017 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3018 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3019 TTI::TargetCostKind CostKind) {
3020 if (isa<ScalableVectorType>(Ty))
3021 return InstructionCost::getInvalid();
3022 if (Ty->getElementType()->isFloatingPointTy())
3023 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3024 CostKind);
3025 return DemandedElts.popcount() * (Insert + Extract) *
3026 ST->getVectorInsertExtractBaseCost();
3027 }
3028
3029 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3030 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3031 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3032 ArrayRef<const Value *> Args,
3033 const Instruction *CxtI) {
3034
3035 // The code-generator is currently not able to handle scalable vectors
3036 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3037 // it. This change will be removed when code-generation for these types is
3038 // sufficiently reliable.
3039 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3040 if (VTy->getElementCount() == ElementCount::getScalable(1))
3041 return InstructionCost::getInvalid();
3042
3043 // TODO: Handle more cost kinds.
3044 if (CostKind != TTI::TCK_RecipThroughput)
3045 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3046 Op2Info, Args, CxtI);
3047
3048 // Legalize the type.
3049 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3050 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3051
3052 switch (ISD) {
3053 default:
3054 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3055 Op2Info);
3056 case ISD::SDIV:
3057 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3058 // On AArch64, scalar signed division by a constant power of two is
3059 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3060 // The OperandValue properties may not be the same as those of the
3061 // previous operation; conservatively assume OP_None.
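// A rough sketch of that expansion for `x sdiv 8`:
//   add  tmp, x, #7        // bias negative dividends
//   cmp  x, #0
//   csel tmp, tmp, x, lt
//   asr  res, tmp, #3
// The Add/Sub/Select/AShr costs accumulated below approximate this sequence.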
3062 InstructionCost Cost = getArithmeticInstrCost( 3063 Instruction::Add, Ty, CostKind, 3064 Op1Info.getNoProps(), Op2Info.getNoProps()); 3065 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, 3066 Op1Info.getNoProps(), Op2Info.getNoProps()); 3067 Cost += getArithmeticInstrCost( 3068 Instruction::Select, Ty, CostKind, 3069 Op1Info.getNoProps(), Op2Info.getNoProps()); 3070 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, 3071 Op1Info.getNoProps(), Op2Info.getNoProps()); 3072 return Cost; 3073 } 3074 [[fallthrough]]; 3075 case ISD::UDIV: { 3076 if (Op2Info.isConstant() && Op2Info.isUniform()) { 3077 auto VT = TLI->getValueType(DL, Ty); 3078 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { 3079 // Vector signed division by constant are expanded to the 3080 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division 3081 // to MULHS + SUB + SRL + ADD + SRL. 3082 InstructionCost MulCost = getArithmeticInstrCost( 3083 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); 3084 InstructionCost AddCost = getArithmeticInstrCost( 3085 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); 3086 InstructionCost ShrCost = getArithmeticInstrCost( 3087 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); 3088 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; 3089 } 3090 } 3091 3092 InstructionCost Cost = BaseT::getArithmeticInstrCost( 3093 Opcode, Ty, CostKind, Op1Info, Op2Info); 3094 if (Ty->isVectorTy()) { 3095 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { 3096 // SDIV/UDIV operations are lowered using SVE, then we can have less 3097 // costs. 3098 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) 3099 ->getPrimitiveSizeInBits() 3100 .getFixedValue() < 128) { 3101 EVT VT = TLI->getValueType(DL, Ty); 3102 static const CostTblEntry DivTbl[]{ 3103 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, 3104 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, 3105 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, 3106 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, 3107 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, 3108 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; 3109 3110 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); 3111 if (nullptr != Entry) 3112 return Entry->Cost; 3113 } 3114 // For 8/16-bit elements, the cost is higher because the type 3115 // requires promotion and possibly splitting: 3116 if (LT.second.getScalarType() == MVT::i8) 3117 Cost *= 8; 3118 else if (LT.second.getScalarType() == MVT::i16) 3119 Cost *= 4; 3120 return Cost; 3121 } else { 3122 // If one of the operands is a uniform constant then the cost for each 3123 // element is Cost for insertion, extraction and division. 3124 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the 3125 // operation with scalar type 3126 if ((Op1Info.isConstant() && Op1Info.isUniform()) || 3127 (Op2Info.isConstant() && Op2Info.isUniform())) { 3128 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 3129 InstructionCost DivCost = BaseT::getArithmeticInstrCost( 3130 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info); 3131 return (4 + DivCost) * VTy->getNumElements(); 3132 } 3133 } 3134 // On AArch64, without SVE, vector divisions are expanded 3135 // into scalar divisions of each pair of elements. 
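// Roughly: each lane is moved to a GPR, divided with a scalar sdiv/udiv and
// moved back again, so on top of the scalar division cost we add per-element
// extract and insert costs below, then double everything to account for both
// vector operands.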
3136 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3137 CostKind, Op1Info, Op2Info);
3138 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3139 Op1Info, Op2Info);
3140 }
3141
3142 // TODO: if one of the arguments is scalar, then it's not necessary to
3143 // double the cost of handling the vector elements.
3144 Cost += Cost;
3145 }
3146 return Cost;
3147 }
3148 case ISD::MUL:
3149 // When SVE is available, we can lower the v2i64 operation using
3150 // the SVE mul instruction, which has a lower cost.
3151 if (LT.second == MVT::v2i64 && ST->hasSVE())
3152 return LT.first;
3153
3154 // When SVE is not available, there is no MUL.2d instruction,
3155 // which means mul <2 x i64> is expensive as elements are extracted
3156 // from the vectors and the muls scalarized.
3157 // As getScalarizationOverhead is a bit too pessimistic, we
3158 // estimate the cost for a i64 vector directly here, which is:
3159 // - four 2-cost i64 extracts,
3160 // - two 2-cost i64 inserts, and
3161 // - two 1-cost muls.
3162 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3163 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3164 // need to scalarize, so the cost can be cheaper (smull or umull).
3166 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3167 return LT.first;
3168 return LT.first * 14;
3169 case ISD::ADD:
3170 case ISD::XOR:
3171 case ISD::OR:
3172 case ISD::AND:
3173 case ISD::SRL:
3174 case ISD::SRA:
3175 case ISD::SHL:
3176 // These nodes are marked as 'custom' for combining purposes only.
3177 // We know that they are legal. See LowerAdd in ISelLowering.
3178 return LT.first;
3179
3180 case ISD::FNEG:
3181 case ISD::FADD:
3182 case ISD::FSUB:
3183 // Increase the cost for half and bfloat types if not architecturally
3184 // supported.
3185 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3186 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3187 return 2 * LT.first;
3188 if (!Ty->getScalarType()->isFP128Ty())
3189 return LT.first;
3190 [[fallthrough]];
3191 case ISD::FMUL:
3192 case ISD::FDIV:
3193 // These nodes are marked as 'custom' just to lower them to SVE.
3194 // We know said lowering will incur no additional cost.
3195 if (!Ty->getScalarType()->isFP128Ty())
3196 return 2 * LT.first;
3197
3198 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3199 Op2Info);
3200 case ISD::FREM:
3201 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3202 // those functions are not declared in the module.
3203 if (!Ty->isVectorTy())
3204 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3205 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3206 Op2Info);
3207 }
3208 }
3209
3210 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3211 ScalarEvolution *SE,
3212 const SCEV *Ptr) {
3213 // Address computations in vectorized code with non-consecutive addresses will
3214 // likely result in more instructions compared to scalar code where the
3215 // computation can more often be merged into the index mode. The resulting
3216 // extra micro-ops can significantly decrease throughput.
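// When the stride is not a small constant, the fixed NeonNonConstStrideOverhead
// value returned below is interpreted as the number of vector instructions
// needed to hide the extra address computation.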
3217 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3218 int MaxMergeDistance = 64;
3219
3220 if (Ty->isVectorTy() && SE &&
3221 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3222 return NumVectorInstToHideOverhead;
3223
3224 // In many cases the address computation is not merged into the instruction
3225 // addressing mode.
3226 return 1;
3227 }
3228
3229 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3230 Type *CondTy,
3231 CmpInst::Predicate VecPred,
3232 TTI::TargetCostKind CostKind,
3233 const Instruction *I) {
3234 // TODO: Handle other cost kinds.
3235 if (CostKind != TTI::TCK_RecipThroughput)
3236 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3237 I);
3238
3239 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3240 // We don't lower some vector selects well that are wider than the register
3241 // width.
3242 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3243 // We would need this many instructions to hide the scalarization happening.
3244 const int AmortizationCost = 20;
3245
3246 // If VecPred is not set, check if we can get a predicate from the context
3247 // instruction, if its type matches the requested ValTy.
3248 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3249 CmpInst::Predicate CurrentPred;
3250 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3251 m_Value())))
3252 VecPred = CurrentPred;
3253 }
3254 // Check if we have a compare/select chain that can be lowered using
3255 // a (F)CMxx & BFI pair.
3256 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3257 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3258 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3259 VecPred == CmpInst::FCMP_UNE) {
3260 static const auto ValidMinMaxTys = {
3261 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3262 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3263 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3264
3265 auto LT = getTypeLegalizationCost(ValTy);
3266 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3267 (ST->hasFullFP16() &&
3268 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3269 return LT.first;
3270 }
3271
3272 static const TypeConversionCostTblEntry
3273 VectorSelectTbl[] = {
3274 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3275 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3276 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3277 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3278 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3279 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3280 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3281 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3282 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3283 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3284 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3285 };
3286
3287 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3288 EVT SelValTy = TLI->getValueType(DL, ValTy);
3289 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3290 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3291 SelCondTy.getSimpleVT(),
3292 SelValTy.getSimpleVT()))
3293 return Entry->Cost;
3294 }
3295 }
3296
3297 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3298 auto LT = getTypeLegalizationCost(ValTy);
3299 // Cost v4f16 FCmp without FP16 support via converting to
v4f32 and back. 3300 if (LT.second == MVT::v4f16 && !ST->hasFullFP16()) 3301 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn 3302 } 3303 3304 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands. 3305 // FIXME: This can apply to more conditions and add/sub if it can be shown to 3306 // be profitable. 3307 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && 3308 ICmpInst::isEquality(VecPred) && 3309 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) && 3310 match(I->getOperand(1), m_Zero()) && 3311 match(I->getOperand(0), m_And(m_Value(), m_Value()))) 3312 return 0; 3313 3314 // The base case handles scalable vectors fine for now, since it treats the 3315 // cost as 1 * legalization cost. 3316 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 3317 } 3318 3319 AArch64TTIImpl::TTI::MemCmpExpansionOptions 3320 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 3321 TTI::MemCmpExpansionOptions Options; 3322 if (ST->requiresStrictAlign()) { 3323 // TODO: Add cost modeling for strict align. Misaligned loads expand to 3324 // a bunch of instructions when strict align is enabled. 3325 return Options; 3326 } 3327 Options.AllowOverlappingLoads = true; 3328 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 3329 Options.NumLoadsPerBlock = Options.MaxNumLoads; 3330 // TODO: Though vector loads usually perform well on AArch64, in some targets 3331 // they may wake up the FP unit, which raises the power consumption. Perhaps 3332 // they could be used with no holds barred (-O3). 3333 Options.LoadSizes = {8, 4, 2, 1}; 3334 Options.AllowedTailExpansions = {3, 5, 6}; 3335 return Options; 3336 } 3337 3338 bool AArch64TTIImpl::prefersVectorizedAddressing() const { 3339 return ST->hasSVE(); 3340 } 3341 3342 InstructionCost 3343 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 3344 Align Alignment, unsigned AddressSpace, 3345 TTI::TargetCostKind CostKind) { 3346 if (useNeonVector(Src)) 3347 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 3348 CostKind); 3349 auto LT = getTypeLegalizationCost(Src); 3350 if (!LT.first.isValid()) 3351 return InstructionCost::getInvalid(); 3352 3353 // Return an invalid cost for element types that we are unable to lower. 3354 auto *VT = cast<VectorType>(Src); 3355 if (VT->getElementType()->isIntegerTy(1)) 3356 return InstructionCost::getInvalid(); 3357 3358 // The code-generator is currently not able to handle scalable vectors 3359 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3360 // it. This change will be removed when code-generation for these types is 3361 // sufficiently reliable. 3362 if (VT->getElementCount() == ElementCount::getScalable(1)) 3363 return InstructionCost::getInvalid(); 3364 3365 return LT.first; 3366 } 3367 3368 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 3369 return Opcode == Instruction::Load ? 
SVEGatherOverhead : SVEScatterOverhead; 3370 } 3371 3372 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 3373 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 3374 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 3375 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy)) 3376 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 3377 Alignment, CostKind, I); 3378 auto *VT = cast<VectorType>(DataTy); 3379 auto LT = getTypeLegalizationCost(DataTy); 3380 if (!LT.first.isValid()) 3381 return InstructionCost::getInvalid(); 3382 3383 // Return an invalid cost for element types that we are unable to lower. 3384 if (!LT.second.isVector() || 3385 !isElementTypeLegalForScalableVector(VT->getElementType()) || 3386 VT->getElementType()->isIntegerTy(1)) 3387 return InstructionCost::getInvalid(); 3388 3389 // The code-generator is currently not able to handle scalable vectors 3390 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3391 // it. This change will be removed when code-generation for these types is 3392 // sufficiently reliable. 3393 if (VT->getElementCount() == ElementCount::getScalable(1)) 3394 return InstructionCost::getInvalid(); 3395 3396 ElementCount LegalVF = LT.second.getVectorElementCount(); 3397 InstructionCost MemOpCost = 3398 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, 3399 {TTI::OK_AnyValue, TTI::OP_None}, I); 3400 // Add on an overhead cost for using gathers/scatters. 3401 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 3402 // point we may want a per-CPU overhead. 3403 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 3404 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 3405 } 3406 3407 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 3408 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 3409 } 3410 3411 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 3412 MaybeAlign Alignment, 3413 unsigned AddressSpace, 3414 TTI::TargetCostKind CostKind, 3415 TTI::OperandValueInfo OpInfo, 3416 const Instruction *I) { 3417 EVT VT = TLI->getValueType(DL, Ty, true); 3418 // Type legalization can't handle structs 3419 if (VT == MVT::Other) 3420 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 3421 CostKind); 3422 3423 auto LT = getTypeLegalizationCost(Ty); 3424 if (!LT.first.isValid()) 3425 return InstructionCost::getInvalid(); 3426 3427 // The code-generator is currently not able to handle scalable vectors 3428 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3429 // it. This change will be removed when code-generation for these types is 3430 // sufficiently reliable. 3431 // We also only support full register predicate loads and stores. 3432 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 3433 if (VTy->getElementCount() == ElementCount::getScalable(1) || 3434 (VTy->getElementType()->isIntegerTy(1) && 3435 !VTy->getElementCount().isKnownMultipleOf( 3436 ElementCount::getScalable(16)))) 3437 return InstructionCost::getInvalid(); 3438 3439 // TODO: consider latency as well for TCK_SizeAndLatency. 
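// For the size-oriented cost kinds, LT.first (the number of registers the
// legalized type occupies) is used as a rough proxy for the number of
// load/store instructions emitted.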
3440 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 3441 return LT.first; 3442 3443 if (CostKind != TTI::TCK_RecipThroughput) 3444 return 1; 3445 3446 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 3447 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 3448 // Unaligned stores are extremely inefficient. We don't split all 3449 // unaligned 128-bit stores because the negative impact that has shown in 3450 // practice on inlined block copy code. 3451 // We make such stores expensive so that we will only vectorize if there 3452 // are 6 other instructions getting vectorized. 3453 const int AmortizationCost = 6; 3454 3455 return LT.first * 2 * AmortizationCost; 3456 } 3457 3458 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. 3459 if (Ty->isPtrOrPtrVectorTy()) 3460 return LT.first; 3461 3462 if (useNeonVector(Ty)) { 3463 // Check truncating stores and extending loads. 3464 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 3465 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 3466 if (VT == MVT::v4i8) 3467 return 2; 3468 // Otherwise we need to scalarize. 3469 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 3470 } 3471 EVT EltVT = VT.getVectorElementType(); 3472 unsigned EltSize = EltVT.getScalarSizeInBits(); 3473 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 || 3474 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment || 3475 *Alignment != Align(1)) 3476 return LT.first; 3477 // FIXME: v3i8 lowering currently is very inefficient, due to automatic 3478 // widening to v4i8, which produces suboptimal results. 3479 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8) 3480 return LT.first; 3481 3482 // Check non-power-of-2 loads/stores for legal vector element types with 3483 // NEON. Non-power-of-2 memory ops will get broken down to a set of 3484 // operations on smaller power-of-2 ops, including ld1/st1. 3485 LLVMContext &C = Ty->getContext(); 3486 InstructionCost Cost(0); 3487 SmallVector<EVT> TypeWorklist; 3488 TypeWorklist.push_back(VT); 3489 while (!TypeWorklist.empty()) { 3490 EVT CurrVT = TypeWorklist.pop_back_val(); 3491 unsigned CurrNumElements = CurrVT.getVectorNumElements(); 3492 if (isPowerOf2_32(CurrNumElements)) { 3493 Cost += 1; 3494 continue; 3495 } 3496 3497 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2; 3498 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2)); 3499 TypeWorklist.push_back( 3500 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2)); 3501 } 3502 return Cost; 3503 } 3504 3505 return LT.first; 3506 } 3507 3508 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 3509 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 3510 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 3511 bool UseMaskForCond, bool UseMaskForGaps) { 3512 assert(Factor >= 2 && "Invalid interleave factor"); 3513 auto *VecVTy = cast<VectorType>(VecTy); 3514 3515 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2)) 3516 return InstructionCost::getInvalid(); 3517 3518 // Vectorization for masked interleaved accesses is only enabled for scalable 3519 // VF. 
3520 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) 3521 return InstructionCost::getInvalid(); 3522 3523 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { 3524 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); 3525 auto *SubVecTy = 3526 VectorType::get(VecVTy->getElementType(), 3527 VecVTy->getElementCount().divideCoefficientBy(Factor)); 3528 3529 // ldN/stN only support legal vector types of size 64 or 128 in bits. 3530 // Accesses having vector types that are a multiple of 128 bits can be 3531 // matched to more than one ldN/stN instruction. 3532 bool UseScalable; 3533 if (MinElts % Factor == 0 && 3534 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 3535 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 3536 } 3537 3538 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 3539 Alignment, AddressSpace, CostKind, 3540 UseMaskForCond, UseMaskForGaps); 3541 } 3542 3543 InstructionCost 3544 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 3545 InstructionCost Cost = 0; 3546 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3547 for (auto *I : Tys) { 3548 if (!I->isVectorTy()) 3549 continue; 3550 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 3551 128) 3552 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 3553 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 3554 } 3555 return Cost; 3556 } 3557 3558 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { 3559 return ST->getMaxInterleaveFactor(); 3560 } 3561 3562 // For Falkor, we want to avoid having too many strided loads in a loop since 3563 // that can exhaust the HW prefetcher resources. We adjust the unroller 3564 // MaxCount preference below to attempt to ensure unrolling doesn't create too 3565 // many strided loads. 3566 static void 3567 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 3568 TargetTransformInfo::UnrollingPreferences &UP) { 3569 enum { MaxStridedLoads = 7 }; 3570 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 3571 int StridedLoads = 0; 3572 // FIXME? We could make this more precise by looking at the CFG and 3573 // e.g. not counting loads in each side of an if-then-else diamond. 3574 for (const auto BB : L->blocks()) { 3575 for (auto &I : *BB) { 3576 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 3577 if (!LMemI) 3578 continue; 3579 3580 Value *PtrValue = LMemI->getPointerOperand(); 3581 if (L->isLoopInvariant(PtrValue)) 3582 continue; 3583 3584 const SCEV *LSCEV = SE.getSCEV(PtrValue); 3585 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 3586 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 3587 continue; 3588 3589 // FIXME? We could take pairing of unrolled load copies into account 3590 // by looking at the AddRec, but we would probably have to limit this 3591 // to loops with no stores or other memory optimization barriers. 3592 ++StridedLoads; 3593 // We've seen enough strided loads that seeing more won't make a 3594 // difference. 3595 if (StridedLoads > MaxStridedLoads / 2) 3596 return StridedLoads; 3597 } 3598 } 3599 return StridedLoads; 3600 }; 3601 3602 int StridedLoads = countStridedLoads(L, SE); 3603 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 3604 << " strided loads\n"); 3605 // Pick the largest power of 2 unroll count that won't result in too many 3606 // strided loads. 
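// Worked example with MaxStridedLoads == 7: one strided load gives
// MaxCount = 1 << Log2_32(7) = 4, two or three give MaxCount = 2, and four or
// more (countStridedLoads stops counting at four) give MaxCount = 1.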
3607 if (StridedLoads) { 3608 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 3609 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 3610 << UP.MaxCount << '\n'); 3611 } 3612 } 3613 3614 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 3615 TTI::UnrollingPreferences &UP, 3616 OptimizationRemarkEmitter *ORE) { 3617 // Enable partial unrolling and runtime unrolling. 3618 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 3619 3620 UP.UpperBound = true; 3621 3622 // For inner loop, it is more likely to be a hot one, and the runtime check 3623 // can be promoted out from LICM pass, so the overhead is less, let's try 3624 // a larger threshold to unroll more loops. 3625 if (L->getLoopDepth() > 1) 3626 UP.PartialThreshold *= 2; 3627 3628 // Disable partial & runtime unrolling on -Os. 3629 UP.PartialOptSizeThreshold = 0; 3630 3631 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 3632 EnableFalkorHWPFUnrollFix) 3633 getFalkorUnrollingPreferences(L, SE, UP); 3634 3635 // Scan the loop: don't unroll loops with calls as this could prevent 3636 // inlining. Don't unroll vector loops either, as they don't benefit much from 3637 // unrolling. 3638 for (auto *BB : L->getBlocks()) { 3639 for (auto &I : *BB) { 3640 // Don't unroll vectorised loop. 3641 if (I.getType()->isVectorTy()) 3642 return; 3643 3644 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 3645 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 3646 if (!isLoweredToCall(F)) 3647 continue; 3648 } 3649 return; 3650 } 3651 } 3652 } 3653 3654 // Enable runtime unrolling for in-order models 3655 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 3656 // checking for that case, we can ensure that the default behaviour is 3657 // unchanged 3658 if (ST->getProcFamily() != AArch64Subtarget::Others && 3659 !ST->getSchedModel().isOutOfOrder()) { 3660 UP.Runtime = true; 3661 UP.Partial = true; 3662 UP.UnrollRemainder = true; 3663 UP.DefaultUnrollRuntimeCount = 4; 3664 3665 UP.UnrollAndJam = true; 3666 UP.UnrollAndJamInnerLoopThreshold = 60; 3667 } 3668 } 3669 3670 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 3671 TTI::PeelingPreferences &PP) { 3672 BaseT::getPeelingPreferences(L, SE, PP); 3673 } 3674 3675 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 3676 Type *ExpectedType) { 3677 switch (Inst->getIntrinsicID()) { 3678 default: 3679 return nullptr; 3680 case Intrinsic::aarch64_neon_st2: 3681 case Intrinsic::aarch64_neon_st3: 3682 case Intrinsic::aarch64_neon_st4: { 3683 // Create a struct type 3684 StructType *ST = dyn_cast<StructType>(ExpectedType); 3685 if (!ST) 3686 return nullptr; 3687 unsigned NumElts = Inst->arg_size() - 1; 3688 if (ST->getNumElements() != NumElts) 3689 return nullptr; 3690 for (unsigned i = 0, e = NumElts; i != e; ++i) { 3691 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 3692 return nullptr; 3693 } 3694 Value *Res = PoisonValue::get(ExpectedType); 3695 IRBuilder<> Builder(Inst); 3696 for (unsigned i = 0, e = NumElts; i != e; ++i) { 3697 Value *L = Inst->getArgOperand(i); 3698 Res = Builder.CreateInsertValue(Res, L, i); 3699 } 3700 return Res; 3701 } 3702 case Intrinsic::aarch64_neon_ld2: 3703 case Intrinsic::aarch64_neon_ld3: 3704 case Intrinsic::aarch64_neon_ld4: 3705 if (Inst->getType() == ExpectedType) 3706 return Inst; 3707 return nullptr; 3708 } 3709 } 3710 3711 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 3712 MemIntrinsicInfo &Info) { 
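// Describe the NEON structured load/store intrinsics as plain memory
// operations (which pointer they read or write) so that passes such as
// EarlyCSE can reason about them like ordinary loads and stores.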
3713 switch (Inst->getIntrinsicID()) {
3714 default:
3715 break;
3716 case Intrinsic::aarch64_neon_ld2:
3717 case Intrinsic::aarch64_neon_ld3:
3718 case Intrinsic::aarch64_neon_ld4:
3719 Info.ReadMem = true;
3720 Info.WriteMem = false;
3721 Info.PtrVal = Inst->getArgOperand(0);
3722 break;
3723 case Intrinsic::aarch64_neon_st2:
3724 case Intrinsic::aarch64_neon_st3:
3725 case Intrinsic::aarch64_neon_st4:
3726 Info.ReadMem = false;
3727 Info.WriteMem = true;
3728 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3729 break;
3730 }
3731
3732 switch (Inst->getIntrinsicID()) {
3733 default:
3734 return false;
3735 case Intrinsic::aarch64_neon_ld2:
3736 case Intrinsic::aarch64_neon_st2:
3737 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3738 break;
3739 case Intrinsic::aarch64_neon_ld3:
3740 case Intrinsic::aarch64_neon_st3:
3741 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3742 break;
3743 case Intrinsic::aarch64_neon_ld4:
3744 case Intrinsic::aarch64_neon_st4:
3745 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3746 break;
3747 }
3748 return true;
3749 }
3750
3751 /// See if \p I should be considered for address type promotion. We check if
3752 /// \p I is a sext with the right type that is used in memory accesses. If it
3753 /// is used in a "complex" getelementptr, we allow it to be promoted without
3754 /// finding other sext instructions that sign extended the same initial value.
3755 /// A getelementptr is considered "complex" if it has more than 2 operands.
3756 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3757 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3758 bool Considerable = false;
3759 AllowPromotionWithoutCommonHeader = false;
3760 if (!isa<SExtInst>(&I))
3761 return false;
3762 Type *ConsideredSExtType =
3763 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3764 if (I.getType() != ConsideredSExtType)
3765 return false;
3766 // See if the sext is the one with the right type and used in at least one
3767 // GetElementPtrInst.
3768 for (const User *U : I.users()) {
3769 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3770 Considerable = true;
3771 // A getelementptr is considered "complex" if it has more than 2
3772 // operands. We will promote a SExt used in such a complex GEP as we
3773 // expect some computation to be merged if they are done on 64 bits.
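// For instance (illustrative IR), a GEP such as
//   getelementptr [16 x i32], ptr %p, i64 %i, i64 %j
// has three operands (the base pointer plus two indices) and is treated as
// complex below.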
3774 if (GEPInst->getNumOperands() > 2) { 3775 AllowPromotionWithoutCommonHeader = true; 3776 break; 3777 } 3778 } 3779 } 3780 return Considerable; 3781 } 3782 3783 bool AArch64TTIImpl::isLegalToVectorizeReduction( 3784 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 3785 if (!VF.isScalable()) 3786 return true; 3787 3788 Type *Ty = RdxDesc.getRecurrenceType(); 3789 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 3790 return false; 3791 3792 switch (RdxDesc.getRecurrenceKind()) { 3793 case RecurKind::Add: 3794 case RecurKind::FAdd: 3795 case RecurKind::And: 3796 case RecurKind::Or: 3797 case RecurKind::Xor: 3798 case RecurKind::SMin: 3799 case RecurKind::SMax: 3800 case RecurKind::UMin: 3801 case RecurKind::UMax: 3802 case RecurKind::FMin: 3803 case RecurKind::FMax: 3804 case RecurKind::FMulAdd: 3805 case RecurKind::IAnyOf: 3806 case RecurKind::FAnyOf: 3807 return true; 3808 default: 3809 return false; 3810 } 3811 } 3812 3813 InstructionCost 3814 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 3815 FastMathFlags FMF, 3816 TTI::TargetCostKind CostKind) { 3817 // The code-generator is currently not able to handle scalable vectors 3818 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3819 // it. This change will be removed when code-generation for these types is 3820 // sufficiently reliable. 3821 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 3822 if (VTy->getElementCount() == ElementCount::getScalable(1)) 3823 return InstructionCost::getInvalid(); 3824 3825 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 3826 3827 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 3828 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 3829 3830 InstructionCost LegalizationCost = 0; 3831 if (LT.first > 1) { 3832 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 3833 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); 3834 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 3835 } 3836 3837 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 3838 } 3839 3840 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 3841 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 3842 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 3843 InstructionCost LegalizationCost = 0; 3844 if (LT.first > 1) { 3845 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 3846 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 3847 LegalizationCost *= LT.first - 1; 3848 } 3849 3850 int ISD = TLI->InstructionOpcodeToISD(Opcode); 3851 assert(ISD && "Invalid opcode"); 3852 // Add the final reduction cost for the legal horizontal reduction 3853 switch (ISD) { 3854 case ISD::ADD: 3855 case ISD::AND: 3856 case ISD::OR: 3857 case ISD::XOR: 3858 case ISD::FADD: 3859 return LegalizationCost + 2; 3860 default: 3861 return InstructionCost::getInvalid(); 3862 } 3863 } 3864 3865 InstructionCost 3866 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 3867 std::optional<FastMathFlags> FMF, 3868 TTI::TargetCostKind CostKind) { 3869 // The code-generator is currently not able to handle scalable vectors 3870 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3871 // it. This change will be removed when code-generation for these types is 3872 // sufficiently reliable. 
3873 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy)) 3874 if (VTy->getElementCount() == ElementCount::getScalable(1)) 3875 return InstructionCost::getInvalid(); 3876 3877 if (TTI::requiresOrderedReduction(FMF)) { 3878 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 3879 InstructionCost BaseCost = 3880 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 3881 // Add on extra cost to reflect the extra overhead on some CPUs. We still 3882 // end up vectorizing for more computationally intensive loops. 3883 return BaseCost + FixedVTy->getNumElements(); 3884 } 3885 3886 if (Opcode != Instruction::FAdd) 3887 return InstructionCost::getInvalid(); 3888 3889 auto *VTy = cast<ScalableVectorType>(ValTy); 3890 InstructionCost Cost = 3891 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 3892 Cost *= getMaxNumElements(VTy->getElementCount()); 3893 return Cost; 3894 } 3895 3896 if (isa<ScalableVectorType>(ValTy)) 3897 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 3898 3899 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 3900 MVT MTy = LT.second; 3901 int ISD = TLI->InstructionOpcodeToISD(Opcode); 3902 assert(ISD && "Invalid opcode"); 3903 3904 // Horizontal adds can use the 'addv' instruction. We model the cost of these 3905 // instructions as twice a normal vector add, plus 1 for each legalization 3906 // step (LT.first). This is the only arithmetic vector reduction operation for 3907 // which we have an instruction. 3908 // OR, XOR and AND costs should match the codegen from: 3909 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 3910 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 3911 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 3912 static const CostTblEntry CostTblNoPairwise[]{ 3913 {ISD::ADD, MVT::v8i8, 2}, 3914 {ISD::ADD, MVT::v16i8, 2}, 3915 {ISD::ADD, MVT::v4i16, 2}, 3916 {ISD::ADD, MVT::v8i16, 2}, 3917 {ISD::ADD, MVT::v4i32, 2}, 3918 {ISD::ADD, MVT::v2i64, 2}, 3919 {ISD::OR, MVT::v8i8, 15}, 3920 {ISD::OR, MVT::v16i8, 17}, 3921 {ISD::OR, MVT::v4i16, 7}, 3922 {ISD::OR, MVT::v8i16, 9}, 3923 {ISD::OR, MVT::v2i32, 3}, 3924 {ISD::OR, MVT::v4i32, 5}, 3925 {ISD::OR, MVT::v2i64, 3}, 3926 {ISD::XOR, MVT::v8i8, 15}, 3927 {ISD::XOR, MVT::v16i8, 17}, 3928 {ISD::XOR, MVT::v4i16, 7}, 3929 {ISD::XOR, MVT::v8i16, 9}, 3930 {ISD::XOR, MVT::v2i32, 3}, 3931 {ISD::XOR, MVT::v4i32, 5}, 3932 {ISD::XOR, MVT::v2i64, 3}, 3933 {ISD::AND, MVT::v8i8, 15}, 3934 {ISD::AND, MVT::v16i8, 17}, 3935 {ISD::AND, MVT::v4i16, 7}, 3936 {ISD::AND, MVT::v8i16, 9}, 3937 {ISD::AND, MVT::v2i32, 3}, 3938 {ISD::AND, MVT::v4i32, 5}, 3939 {ISD::AND, MVT::v2i64, 3}, 3940 }; 3941 switch (ISD) { 3942 default: 3943 break; 3944 case ISD::ADD: 3945 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 3946 return (LT.first - 1) + Entry->Cost; 3947 break; 3948 case ISD::XOR: 3949 case ISD::AND: 3950 case ISD::OR: 3951 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 3952 if (!Entry) 3953 break; 3954 auto *ValVTy = cast<FixedVectorType>(ValTy); 3955 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && 3956 isPowerOf2_32(ValVTy->getNumElements())) { 3957 InstructionCost ExtraCost = 0; 3958 if (LT.first != 1) { 3959 // Type needs to be split, so there is an extra cost of LT.first - 1 3960 // arithmetic ops. 
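// For example, a v32i8 and/or/xor reduction legalizes to two v16i8 halves:
// one extra v16i8 arithmetic op to combine the halves, plus the v16i8 entry
// from the table above for the final reduction.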
3961 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 3962 MTy.getVectorNumElements()); 3963 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 3964 ExtraCost *= LT.first - 1; 3965 } 3966 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov 3967 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost; 3968 return Cost + ExtraCost; 3969 } 3970 break; 3971 } 3972 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 3973 } 3974 3975 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 3976 static const CostTblEntry ShuffleTbl[] = { 3977 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 3978 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 3979 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 3980 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 3981 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 3982 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 3983 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 3984 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 3985 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 3986 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 3987 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 3988 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 3989 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 3990 }; 3991 3992 // The code-generator is currently not able to handle scalable vectors 3993 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3994 // it. This change will be removed when code-generation for these types is 3995 // sufficiently reliable. 3996 if (Tp->getElementCount() == ElementCount::getScalable(1)) 3997 return InstructionCost::getInvalid(); 3998 3999 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 4000 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 4001 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4002 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 4003 ? TLI->getPromotedVTForPredicate(EVT(LT.second)) 4004 : LT.second; 4005 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 4006 InstructionCost LegalizationCost = 0; 4007 if (Index < 0) { 4008 LegalizationCost = 4009 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 4010 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 4011 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 4012 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4013 } 4014 4015 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 4016 // Cost performed on a promoted type. 4017 if (LT.second.getScalarType() == MVT::i1) { 4018 LegalizationCost += 4019 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 4020 TTI::CastContextHint::None, CostKind) + 4021 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 4022 TTI::CastContextHint::None, CostKind); 4023 } 4024 const auto *Entry = 4025 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 4026 assert(Entry && "Illegal Type for Splice"); 4027 LegalizationCost += Entry->Cost; 4028 return LegalizationCost * LT.first; 4029 } 4030 4031 InstructionCost AArch64TTIImpl::getShuffleCost( 4032 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, 4033 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, 4034 ArrayRef<const Value *> Args, const Instruction *CxtI) { 4035 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 4036 4037 // If we have a Mask, and the LT is being legalized somehow, split the Mask 4038 // into smaller vectors and sum the cost of each shuffle. 
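// For example, an 8 x i32 shuffle legalizes to two v4i32 registers, so its
// 8-element mask is split into two 4-element sub-masks that are costed
// individually below.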
4039 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && 4040 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && 4041 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { 4042 4043 // Check for LD3/LD4 instructions, which are represented in llvm IR as 4044 // deinterleaving-shuffle(load). The shuffle cost could potentially be free, 4045 // but we model it with a cost of LT.first so that LD3/LD4 have a higher 4046 // cost than just the load. 4047 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) && 4048 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) || 4049 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))) 4050 return std::max<InstructionCost>(1, LT.first / 4); 4051 4052 // Check for ST3/ST4 instructions, which are represented in llvm IR as 4053 // store(interleaving-shuffle). The shuffle cost could potentially be free, 4054 // but we model it with a cost of LT.first so that ST3/ST4 have a higher 4055 // cost than just the store. 4056 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) && 4057 (ShuffleVectorInst::isInterleaveMask( 4058 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) || 4059 ShuffleVectorInst::isInterleaveMask( 4060 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2))) 4061 return LT.first; 4062 4063 unsigned TpNumElts = Mask.size(); 4064 unsigned LTNumElts = LT.second.getVectorNumElements(); 4065 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; 4066 VectorType *NTp = 4067 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); 4068 InstructionCost Cost; 4069 for (unsigned N = 0; N < NumVecs; N++) { 4070 SmallVector<int> NMask; 4071 // Split the existing mask into chunks of size LTNumElts. Track the source 4072 // sub-vectors to ensure the result has at most 2 inputs. 4073 unsigned Source1, Source2; 4074 unsigned NumSources = 0; 4075 for (unsigned E = 0; E < LTNumElts; E++) { 4076 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] 4077 : PoisonMaskElem; 4078 if (MaskElt < 0) { 4079 NMask.push_back(PoisonMaskElem); 4080 continue; 4081 } 4082 4083 // Calculate which source from the input this comes from and whether it 4084 // is new to us. 4085 unsigned Source = MaskElt / LTNumElts; 4086 if (NumSources == 0) { 4087 Source1 = Source; 4088 NumSources = 1; 4089 } else if (NumSources == 1 && Source != Source1) { 4090 Source2 = Source; 4091 NumSources = 2; 4092 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { 4093 NumSources++; 4094 } 4095 4096 // Add to the new mask. For the NumSources>2 case these are not correct, 4097 // but are only used for the modular lane number. 4098 if (Source == Source1) 4099 NMask.push_back(MaskElt % LTNumElts); 4100 else if (Source == Source2) 4101 NMask.push_back(MaskElt % LTNumElts + LTNumElts); 4102 else 4103 NMask.push_back(MaskElt % LTNumElts); 4104 } 4105 // If the sub-mask has at most 2 input sub-vectors then re-cost it using 4106 // getShuffleCost. If not then cost it using the worst case. 4107 if (NumSources <= 2) 4108 Cost += getShuffleCost(NumSources <= 1 ? 
TTI::SK_PermuteSingleSrc 4109 : TTI::SK_PermuteTwoSrc, 4110 NTp, NMask, CostKind, 0, nullptr, Args, CxtI); 4111 else if (any_of(enumerate(NMask), [&](const auto &ME) { 4112 return ME.value() % LTNumElts == ME.index(); 4113 })) 4114 Cost += LTNumElts - 1; 4115 else 4116 Cost += LTNumElts; 4117 } 4118 return Cost; 4119 } 4120 4121 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); 4122 // Treat extractsubvector as single op permutation. 4123 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; 4124 if (IsExtractSubvector && LT.second.isFixedLengthVector()) 4125 Kind = TTI::SK_PermuteSingleSrc; 4126 4127 // Check for broadcast loads, which are supported by the LD1R instruction. 4128 // In terms of code-size, the shuffle vector is free when a load + dup get 4129 // folded into a LD1R. That's what we check and return here. For performance 4130 // and reciprocal throughput, a LD1R is not completely free. In this case, we 4131 // return the cost for the broadcast below (i.e. 1 for most/all types), so 4132 // that we model the load + dup sequence slightly higher because LD1R is a 4133 // high latency instruction. 4134 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { 4135 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); 4136 if (IsLoad && LT.second.isVector() && 4137 isLegalBroadcastLoad(Tp->getElementType(), 4138 LT.second.getVectorElementCount())) 4139 return 0; 4140 } 4141 4142 // If we have 4 elements for the shuffle and a Mask, get the cost straight 4143 // from the perfect shuffle tables. 4144 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && 4145 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && 4146 all_of(Mask, [](int E) { return E < 8; })) 4147 return getPerfectShuffleCost(Mask); 4148 4149 // Check for identity masks, which we can treat as free. 4150 if (!Mask.empty() && LT.second.isFixedLengthVector() && 4151 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && 4152 all_of(enumerate(Mask), [](const auto &M) { 4153 return M.value() < 0 || M.value() == (int)M.index(); 4154 })) 4155 return 0; 4156 4157 // Check for other shuffles that are not SK_ kinds but we have native 4158 // instructions for, for example ZIP and UZP. 4159 unsigned Unused; 4160 if (LT.second.isFixedLengthVector() && 4161 LT.second.getVectorNumElements() == Mask.size() && 4162 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && 4163 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) || 4164 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) || 4165 // Check for non-zero lane splats 4166 all_of(drop_begin(Mask), 4167 [&Mask](int M) { return M < 0 || M == Mask[0]; }))) 4168 return 1; 4169 4170 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 4171 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 4172 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { 4173 static const CostTblEntry ShuffleTbl[] = { 4174 // Broadcast shuffle kinds can be performed with 'dup'. 
4175 {TTI::SK_Broadcast, MVT::v8i8, 1}, 4176 {TTI::SK_Broadcast, MVT::v16i8, 1}, 4177 {TTI::SK_Broadcast, MVT::v4i16, 1}, 4178 {TTI::SK_Broadcast, MVT::v8i16, 1}, 4179 {TTI::SK_Broadcast, MVT::v2i32, 1}, 4180 {TTI::SK_Broadcast, MVT::v4i32, 1}, 4181 {TTI::SK_Broadcast, MVT::v2i64, 1}, 4182 {TTI::SK_Broadcast, MVT::v4f16, 1}, 4183 {TTI::SK_Broadcast, MVT::v8f16, 1}, 4184 {TTI::SK_Broadcast, MVT::v2f32, 1}, 4185 {TTI::SK_Broadcast, MVT::v4f32, 1}, 4186 {TTI::SK_Broadcast, MVT::v2f64, 1}, 4187 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 4188 // 'zip1/zip2' instructions. 4189 {TTI::SK_Transpose, MVT::v8i8, 1}, 4190 {TTI::SK_Transpose, MVT::v16i8, 1}, 4191 {TTI::SK_Transpose, MVT::v4i16, 1}, 4192 {TTI::SK_Transpose, MVT::v8i16, 1}, 4193 {TTI::SK_Transpose, MVT::v2i32, 1}, 4194 {TTI::SK_Transpose, MVT::v4i32, 1}, 4195 {TTI::SK_Transpose, MVT::v2i64, 1}, 4196 {TTI::SK_Transpose, MVT::v4f16, 1}, 4197 {TTI::SK_Transpose, MVT::v8f16, 1}, 4198 {TTI::SK_Transpose, MVT::v2f32, 1}, 4199 {TTI::SK_Transpose, MVT::v4f32, 1}, 4200 {TTI::SK_Transpose, MVT::v2f64, 1}, 4201 // Select shuffle kinds. 4202 // TODO: handle vXi8/vXi16. 4203 {TTI::SK_Select, MVT::v2i32, 1}, // mov. 4204 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar). 4205 {TTI::SK_Select, MVT::v2i64, 1}, // mov. 4206 {TTI::SK_Select, MVT::v2f32, 1}, // mov. 4207 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar). 4208 {TTI::SK_Select, MVT::v2f64, 1}, // mov. 4209 // PermuteSingleSrc shuffle kinds. 4210 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov. 4211 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case. 4212 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov. 4213 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov. 4214 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case. 4215 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov. 4216 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case. 4217 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case. 4218 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same 4219 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl 4220 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl 4221 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl 4222 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl 4223 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl 4224 // Reverse can be lowered with `rev`. 4225 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64 4226 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT 4227 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT 4228 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64 4229 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT 4230 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT 4231 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT 4232 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT 4233 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT 4234 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64 4235 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64 4236 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64 4237 // Splice can all be lowered as `ext`. 
4238 {TTI::SK_Splice, MVT::v2i32, 1}, 4239 {TTI::SK_Splice, MVT::v4i32, 1}, 4240 {TTI::SK_Splice, MVT::v2i64, 1}, 4241 {TTI::SK_Splice, MVT::v2f32, 1}, 4242 {TTI::SK_Splice, MVT::v4f32, 1}, 4243 {TTI::SK_Splice, MVT::v2f64, 1}, 4244 {TTI::SK_Splice, MVT::v8f16, 1}, 4245 {TTI::SK_Splice, MVT::v8bf16, 1}, 4246 {TTI::SK_Splice, MVT::v8i16, 1}, 4247 {TTI::SK_Splice, MVT::v16i8, 1}, 4248 {TTI::SK_Splice, MVT::v4bf16, 1}, 4249 {TTI::SK_Splice, MVT::v4f16, 1}, 4250 {TTI::SK_Splice, MVT::v4i16, 1}, 4251 {TTI::SK_Splice, MVT::v8i8, 1}, 4252 // Broadcast shuffle kinds for scalable vectors 4253 {TTI::SK_Broadcast, MVT::nxv16i8, 1}, 4254 {TTI::SK_Broadcast, MVT::nxv8i16, 1}, 4255 {TTI::SK_Broadcast, MVT::nxv4i32, 1}, 4256 {TTI::SK_Broadcast, MVT::nxv2i64, 1}, 4257 {TTI::SK_Broadcast, MVT::nxv2f16, 1}, 4258 {TTI::SK_Broadcast, MVT::nxv4f16, 1}, 4259 {TTI::SK_Broadcast, MVT::nxv8f16, 1}, 4260 {TTI::SK_Broadcast, MVT::nxv2bf16, 1}, 4261 {TTI::SK_Broadcast, MVT::nxv4bf16, 1}, 4262 {TTI::SK_Broadcast, MVT::nxv8bf16, 1}, 4263 {TTI::SK_Broadcast, MVT::nxv2f32, 1}, 4264 {TTI::SK_Broadcast, MVT::nxv4f32, 1}, 4265 {TTI::SK_Broadcast, MVT::nxv2f64, 1}, 4266 {TTI::SK_Broadcast, MVT::nxv16i1, 1}, 4267 {TTI::SK_Broadcast, MVT::nxv8i1, 1}, 4268 {TTI::SK_Broadcast, MVT::nxv4i1, 1}, 4269 {TTI::SK_Broadcast, MVT::nxv2i1, 1}, 4270 // Handle the cases for vector.reverse with scalable vectors 4271 {TTI::SK_Reverse, MVT::nxv16i8, 1}, 4272 {TTI::SK_Reverse, MVT::nxv8i16, 1}, 4273 {TTI::SK_Reverse, MVT::nxv4i32, 1}, 4274 {TTI::SK_Reverse, MVT::nxv2i64, 1}, 4275 {TTI::SK_Reverse, MVT::nxv2f16, 1}, 4276 {TTI::SK_Reverse, MVT::nxv4f16, 1}, 4277 {TTI::SK_Reverse, MVT::nxv8f16, 1}, 4278 {TTI::SK_Reverse, MVT::nxv2bf16, 1}, 4279 {TTI::SK_Reverse, MVT::nxv4bf16, 1}, 4280 {TTI::SK_Reverse, MVT::nxv8bf16, 1}, 4281 {TTI::SK_Reverse, MVT::nxv2f32, 1}, 4282 {TTI::SK_Reverse, MVT::nxv4f32, 1}, 4283 {TTI::SK_Reverse, MVT::nxv2f64, 1}, 4284 {TTI::SK_Reverse, MVT::nxv16i1, 1}, 4285 {TTI::SK_Reverse, MVT::nxv8i1, 1}, 4286 {TTI::SK_Reverse, MVT::nxv4i1, 1}, 4287 {TTI::SK_Reverse, MVT::nxv2i1, 1}, 4288 }; 4289 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 4290 return LT.first * Entry->Cost; 4291 } 4292 4293 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 4294 return getSpliceCost(Tp, Index); 4295 4296 // Inserting a subvector can often be done with either a D, S or H register 4297 // move, so long as the inserted vector is "aligned". 4298 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && 4299 LT.second.getSizeInBits() <= 128 && SubTp) { 4300 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 4301 if (SubLT.second.isVector()) { 4302 int NumElts = LT.second.getVectorNumElements(); 4303 int NumSubElts = SubLT.second.getVectorNumElements(); 4304 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 4305 return SubLT.first; 4306 } 4307 } 4308 4309 // Restore optimal kind. 4310 if (IsExtractSubvector) 4311 Kind = TTI::SK_ExtractSubvector; 4312 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args, 4313 CxtI); 4314 } 4315 4316 static bool containsDecreasingPointers(Loop *TheLoop, 4317 PredicatedScalarEvolution *PSE) { 4318 const auto &Strides = DenseMap<Value *, const SCEV *>(); 4319 for (BasicBlock *BB : TheLoop->blocks()) { 4320 // Scan the instructions in the block and look for addresses that are 4321 // consecutive and decreasing. 
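// getPtrStride below returns a negative stride for such accesses (e.g. a load
// of A[i] in a loop where i counts down); Assume=true lets SCEV add run-time
// predicates where needed.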
    for (Instruction &I : *BB) {
      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
                         /*ShouldCheckWrap=*/false)
                .value_or(0) < 0)
          return true;
      }
    }
  }
  return false;
}

bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
  if (!ST->hasSVE())
    return false;

  // We don't currently support vectorisation with interleaving for SVE - with
  // such loops we're better off not using tail-folding. This gives us a chance
  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
  if (TFI->IAI->hasGroups())
    return false;

  TailFoldingOpts Required = TailFoldingOpts::Disabled;
  if (TFI->LVL->getReductionVars().size())
    Required |= TailFoldingOpts::Reductions;
  if (TFI->LVL->getFixedOrderRecurrences().size())
    Required |= TailFoldingOpts::Recurrences;

  // We call this to discover whether any load/store pointers in the loop have
  // negative strides. This will require extra work to reverse the loop
  // predicate, which may be expensive.
  if (containsDecreasingPointers(TFI->LVL->getLoop(),
                                 TFI->LVL->getPredicatedScalarEvolution()))
    Required |= TailFoldingOpts::Reverse;
  if (Required == TailFoldingOpts::Disabled)
    Required |= TailFoldingOpts::Simple;

  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
                                      Required))
    return false;

  // Don't tail-fold for tight loops where we would be better off interleaving
  // with an unpredicated loop.
  unsigned NumInsns = 0;
  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
    NumInsns += BB->sizeWithoutDebug();
  }

  // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
  return NumInsns >= SVETailFoldInsnThreshold;
}

InstructionCost
AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     StackOffset BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset.getFixed();
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  AM.ScalableOffset = BaseOffset.getScalable();
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}

bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
  // For the binary operators (e.g. or) we need to be more careful than
  // selects; here we only transform them if they are already at a natural
  // break point in the code - the end of a block with an unconditional
  // terminator.
  if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
      isa<BranchInst>(I->getNextNode()) &&
      cast<BranchInst>(I->getNextNode())->isUnconditional())
    return true;
  return BaseT::shouldTreatInstructionLikeSelect(I);
}

bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // What is AArch64-specific here is adding the number of instructions to the
  // comparison (though not as the first consideration, as some targets do),
  // along with changing the priority of the base additions.
  // TODO: Maybe a more nuanced tradeoff between instruction count
  // and number of registers? To be investigated at a later date.
  if (EnableLSRCostOpt)
    return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
                    C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
                    C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);

  return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}