1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "llvm/Analysis/IVDescriptors.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/Analysis/TargetTransformInfo.h" 16 #include "llvm/CodeGen/BasicTTIImpl.h" 17 #include "llvm/CodeGen/CostTable.h" 18 #include "llvm/CodeGen/TargetLowering.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/Intrinsics.h" 21 #include "llvm/IR/IntrinsicsAArch64.h" 22 #include "llvm/IR/PatternMatch.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Transforms/InstCombine/InstCombiner.h" 25 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 26 #include <algorithm> 27 #include <optional> 28 using namespace llvm; 29 using namespace llvm::PatternMatch; 30 31 #define DEBUG_TYPE "aarch64tti" 32 33 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 34 cl::init(true), cl::Hidden); 35 36 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 37 cl::Hidden); 38 39 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 40 cl::init(10), cl::Hidden); 41 42 namespace { 43 class TailFoldingKind { 44 private: 45 uint8_t Bits = 0; // Currently defaults to disabled. 46 47 public: 48 enum TailFoldingOpts { 49 TFDisabled = 0x0, 50 TFReductions = 0x01, 51 TFRecurrences = 0x02, 52 TFSimple = 0x80, 53 TFAll = TFReductions | TFRecurrences | TFSimple 54 }; 55 56 void operator=(const std::string &Val) { 57 if (Val.empty()) 58 return; 59 SmallVector<StringRef, 6> TailFoldTypes; 60 StringRef(Val).split(TailFoldTypes, '+', -1, false); 61 for (auto TailFoldType : TailFoldTypes) { 62 if (TailFoldType == "disabled") 63 Bits = 0; 64 else if (TailFoldType == "all") 65 Bits = TFAll; 66 else if (TailFoldType == "default") 67 Bits = 0; // Currently defaults to never tail-folding. 
68 else if (TailFoldType == "simple") 69 add(TFSimple); 70 else if (TailFoldType == "reductions") 71 add(TFReductions); 72 else if (TailFoldType == "recurrences") 73 add(TFRecurrences); 74 else if (TailFoldType == "noreductions") 75 remove(TFReductions); 76 else if (TailFoldType == "norecurrences") 77 remove(TFRecurrences); 78 else { 79 errs() 80 << "invalid argument " << TailFoldType.str() 81 << " to -sve-tail-folding=; each element must be one of: disabled, " 82 "all, default, simple, reductions, noreductions, recurrences, " 83 "norecurrences\n"; 84 } 85 } 86 } 87 88 operator uint8_t() const { return Bits; } 89 90 void add(uint8_t Flag) { Bits |= Flag; } 91 void remove(uint8_t Flag) { Bits &= ~Flag; } 92 }; 93 } // namespace 94 95 TailFoldingKind TailFoldingKindLoc; 96 97 cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding( 98 "sve-tail-folding", 99 cl::desc( 100 "Control the use of vectorisation using tail-folding for SVE:" 101 "\ndisabled No loop types will vectorize using tail-folding" 102 "\ndefault Uses the default tail-folding settings for the target " 103 "CPU" 104 "\nall All legal loop types will vectorize using tail-folding" 105 "\nsimple Use tail-folding for simple loops (not reductions or " 106 "recurrences)" 107 "\nreductions Use tail-folding for loops containing reductions" 108 "\nrecurrences Use tail-folding for loops containing fixed order " 109 "recurrences"), 110 cl::location(TailFoldingKindLoc)); 111 112 // Experimental option that will only be fully functional when the 113 // code-generator is changed to use SVE instead of NEON for all fixed-width 114 // operations. 115 static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( 116 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden); 117 118 // Experimental option that will only be fully functional when the cost-model 119 // and code-generator have been changed to avoid using scalable vector 120 // instructions that are not legal in streaming SVE mode. 121 static cl::opt<bool> EnableScalableAutovecInStreamingMode( 122 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); 123 124 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 125 const Function *Callee) const { 126 SMEAttrs CallerAttrs(*Caller); 127 SMEAttrs CalleeAttrs(*Callee); 128 if (CallerAttrs.requiresSMChange(CalleeAttrs, 129 /*BodyOverridesInterface=*/true) || 130 CallerAttrs.requiresLazySave(CalleeAttrs) || 131 CalleeAttrs.hasNewZAInterface()) 132 return false; 133 134 const TargetMachine &TM = getTLI()->getTargetMachine(); 135 136 const FeatureBitset &CallerBits = 137 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 138 const FeatureBitset &CalleeBits = 139 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 140 141 // Inline a callee if its target-features are a subset of the callers 142 // target-features. 143 return (CallerBits & CalleeBits) == CalleeBits; 144 } 145 146 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( 147 TargetTransformInfo::RegisterKind K) const { 148 assert(K != TargetTransformInfo::RGK_Scalar); 149 return K == TargetTransformInfo::RGK_FixedWidthVector; 150 } 151 152 /// Calculate the cost of materializing a 64-bit value. This helper 153 /// method might only calculate a fraction of a larger immediate. Therefore it 154 /// is valid to return a cost of ZERO. 155 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 156 // Check if the immediate can be encoded within an instruction. 
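  // A few illustrative values (costs follow from the encodability check and
  // the MOV expansion below; the exact sequences are whatever expandMOVImm
  // chooses):
  //   0x0000ffff0000ffff -> 0 (valid 64-bit logical immediate, e.g. ORR)
  //   0x0000000000abcdef -> 2 (MOVZ + MOVK)
  //   0x1234567812345678 -> up to 4 (MOVZ + up to three MOVKs)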
157 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 158 return 0; 159 160 if (Val < 0) 161 Val = ~Val; 162 163 // Calculate how many moves we will need to materialize this constant. 164 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 165 AArch64_IMM::expandMOVImm(Val, 64, Insn); 166 return Insn.size(); 167 } 168 169 /// Calculate the cost of materializing the given constant. 170 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 171 TTI::TargetCostKind CostKind) { 172 assert(Ty->isIntegerTy()); 173 174 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 175 if (BitSize == 0) 176 return ~0U; 177 178 // Sign-extend all constants to a multiple of 64-bit. 179 APInt ImmVal = Imm; 180 if (BitSize & 0x3f) 181 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 182 183 // Split the constant into 64-bit chunks and calculate the cost for each 184 // chunk. 185 InstructionCost Cost = 0; 186 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 187 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 188 int64_t Val = Tmp.getSExtValue(); 189 Cost += getIntImmCost(Val); 190 } 191 // We need at least one instruction to materialze the constant. 192 return std::max<InstructionCost>(1, Cost); 193 } 194 195 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 196 const APInt &Imm, Type *Ty, 197 TTI::TargetCostKind CostKind, 198 Instruction *Inst) { 199 assert(Ty->isIntegerTy()); 200 201 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 202 // There is no cost model for constants with a bit size of 0. Return TCC_Free 203 // here, so that constant hoisting will ignore this constant. 204 if (BitSize == 0) 205 return TTI::TCC_Free; 206 207 unsigned ImmIdx = ~0U; 208 switch (Opcode) { 209 default: 210 return TTI::TCC_Free; 211 case Instruction::GetElementPtr: 212 // Always hoist the base address of a GetElementPtr. 213 if (Idx == 0) 214 return 2 * TTI::TCC_Basic; 215 return TTI::TCC_Free; 216 case Instruction::Store: 217 ImmIdx = 0; 218 break; 219 case Instruction::Add: 220 case Instruction::Sub: 221 case Instruction::Mul: 222 case Instruction::UDiv: 223 case Instruction::SDiv: 224 case Instruction::URem: 225 case Instruction::SRem: 226 case Instruction::And: 227 case Instruction::Or: 228 case Instruction::Xor: 229 case Instruction::ICmp: 230 ImmIdx = 1; 231 break; 232 // Always return TCC_Free for the shift value of a shift instruction. 233 case Instruction::Shl: 234 case Instruction::LShr: 235 case Instruction::AShr: 236 if (Idx == 1) 237 return TTI::TCC_Free; 238 break; 239 case Instruction::Trunc: 240 case Instruction::ZExt: 241 case Instruction::SExt: 242 case Instruction::IntToPtr: 243 case Instruction::PtrToInt: 244 case Instruction::BitCast: 245 case Instruction::PHI: 246 case Instruction::Call: 247 case Instruction::Select: 248 case Instruction::Ret: 249 case Instruction::Load: 250 break; 251 } 252 253 if (Idx == ImmIdx) { 254 int NumConstants = (BitSize + 63) / 64; 255 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 256 return (Cost <= NumConstants * TTI::TCC_Basic) 257 ? static_cast<int>(TTI::TCC_Free) 258 : Cost; 259 } 260 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 261 } 262 263 InstructionCost 264 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 265 const APInt &Imm, Type *Ty, 266 TTI::TargetCostKind CostKind) { 267 assert(Ty->isIntegerTy()); 268 269 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 270 // There is no cost model for constants with a bit size of 0. 
  // Return TCC_Free here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // Cost Model is using the legal type(i32) that i8 and i16 will be
      // converted to +1 so that we match the actual lowering cost
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64, 4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8, 1},
        {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ?
1 432 : 0; 433 return LT.first * Entry->Cost + ExtraCost; 434 } 435 break; 436 } 437 case Intrinsic::sadd_with_overflow: 438 case Intrinsic::uadd_with_overflow: 439 case Intrinsic::ssub_with_overflow: 440 case Intrinsic::usub_with_overflow: 441 case Intrinsic::smul_with_overflow: 442 case Intrinsic::umul_with_overflow: { 443 static const CostTblEntry WithOverflowCostTbl[] = { 444 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 445 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 446 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 447 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 448 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 449 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 450 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 451 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 452 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 453 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 454 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 455 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 456 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 457 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 458 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 459 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 460 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 461 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 462 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 463 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 464 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 465 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 466 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 467 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 468 }; 469 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 470 if (MTy.isSimple()) 471 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 472 MTy.getSimpleVT())) 473 return Entry->Cost; 474 break; 475 } 476 case Intrinsic::fptosi_sat: 477 case Intrinsic::fptoui_sat: { 478 if (ICA.getArgTypes().empty()) 479 break; 480 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 481 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); 482 EVT MTy = TLI->getValueType(DL, RetTy); 483 // Check for the legal types, which are where the size of the input and the 484 // output are the same, or we are using cvt f64->i32 or f32->i64. 485 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 486 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 487 LT.second == MVT::v2f64) && 488 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 489 (LT.second == MVT::f64 && MTy == MVT::i32) || 490 (LT.second == MVT::f32 && MTy == MVT::i64))) 491 return LT.first; 492 // Similarly for fp16 sizes 493 if (ST->hasFullFP16() && 494 ((LT.second == MVT::f16 && MTy == MVT::i32) || 495 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 496 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 497 return LT.first; 498 499 // Otherwise we use a legal convert followed by a min+max 500 if ((LT.second.getScalarType() == MVT::f32 || 501 LT.second.getScalarType() == MVT::f64 || 502 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 503 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 504 Type *LegalTy = 505 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 506 if (LT.second.isVector()) 507 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 508 InstructionCost Cost = 1; 509 IntrinsicCostAttributes Attrs1(IsSigned ? 
Intrinsic::smin : Intrinsic::umin, 510 LegalTy, {LegalTy, LegalTy}); 511 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 512 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, 513 LegalTy, {LegalTy, LegalTy}); 514 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 515 return LT.first * Cost; 516 } 517 break; 518 } 519 default: 520 break; 521 } 522 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 523 } 524 525 /// The function will remove redundant reinterprets casting in the presence 526 /// of the control flow 527 static std::optional<Instruction *> processPhiNode(InstCombiner &IC, 528 IntrinsicInst &II) { 529 SmallVector<Instruction *, 32> Worklist; 530 auto RequiredType = II.getType(); 531 532 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 533 assert(PN && "Expected Phi Node!"); 534 535 // Don't create a new Phi unless we can remove the old one. 536 if (!PN->hasOneUse()) 537 return std::nullopt; 538 539 for (Value *IncValPhi : PN->incoming_values()) { 540 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 541 if (!Reinterpret || 542 Reinterpret->getIntrinsicID() != 543 Intrinsic::aarch64_sve_convert_to_svbool || 544 RequiredType != Reinterpret->getArgOperand(0)->getType()) 545 return std::nullopt; 546 } 547 548 // Create the new Phi 549 LLVMContext &Ctx = PN->getContext(); 550 IRBuilder<> Builder(Ctx); 551 Builder.SetInsertPoint(PN); 552 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 553 Worklist.push_back(PN); 554 555 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 556 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 557 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 558 Worklist.push_back(Reinterpret); 559 } 560 561 // Cleanup Phi Node and reinterprets 562 return IC.replaceInstUsesWith(II, NPN); 563 } 564 565 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 566 // => (binop (pred) (from_svbool _) (from_svbool _)) 567 // 568 // The above transformation eliminates a `to_svbool` in the predicate 569 // operand of bitwise operation `binop` by narrowing the vector width of 570 // the operation. For example, it would convert a `<vscale x 16 x i1> 571 // and` into a `<vscale x 4 x i1> and`. This is profitable because 572 // to_svbool must zero the new lanes during widening, whereas 573 // from_svbool is free. 
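// An illustrative sketch of the rewrite, reusing the shorthand above (the
// <vscale x 4 x i1> type is just an example; any predicate narrower than
// <vscale x 16 x i1> behaves the same way):
//   %wide = to_svbool(<vscale x 4 x i1> %pg)
//   %and  = and_z(%wide, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
//   %res  = from_svbool(%and) ; <vscale x 4 x i1>
// becomes
//   %a.n  = from_svbool(%a)   ; <vscale x 4 x i1>
//   %b.n  = from_svbool(%b)   ; <vscale x 4 x i1>
//   %res  = and_z(%pg, %a.n, %b.n)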
574 static std::optional<Instruction *> 575 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { 576 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 577 if (!BinOp) 578 return std::nullopt; 579 580 auto IntrinsicID = BinOp->getIntrinsicID(); 581 switch (IntrinsicID) { 582 case Intrinsic::aarch64_sve_and_z: 583 case Intrinsic::aarch64_sve_bic_z: 584 case Intrinsic::aarch64_sve_eor_z: 585 case Intrinsic::aarch64_sve_nand_z: 586 case Intrinsic::aarch64_sve_nor_z: 587 case Intrinsic::aarch64_sve_orn_z: 588 case Intrinsic::aarch64_sve_orr_z: 589 break; 590 default: 591 return std::nullopt; 592 } 593 594 auto BinOpPred = BinOp->getOperand(0); 595 auto BinOpOp1 = BinOp->getOperand(1); 596 auto BinOpOp2 = BinOp->getOperand(2); 597 598 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 599 if (!PredIntr || 600 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 601 return std::nullopt; 602 603 auto PredOp = PredIntr->getOperand(0); 604 auto PredOpTy = cast<VectorType>(PredOp->getType()); 605 if (PredOpTy != II.getType()) 606 return std::nullopt; 607 608 IRBuilder<> Builder(II.getContext()); 609 Builder.SetInsertPoint(&II); 610 611 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 612 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 613 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 614 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 615 if (BinOpOp1 == BinOpOp2) 616 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 617 else 618 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 619 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 620 621 auto NarrowedBinOp = 622 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 623 return IC.replaceInstUsesWith(II, NarrowedBinOp); 624 } 625 626 static std::optional<Instruction *> 627 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { 628 // If the reinterpret instruction operand is a PHI Node 629 if (isa<PHINode>(II.getArgOperand(0))) 630 return processPhiNode(IC, II); 631 632 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 633 return BinOpCombine; 634 635 SmallVector<Instruction *, 32> CandidatesForRemoval; 636 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 637 638 const auto *IVTy = cast<VectorType>(II.getType()); 639 640 // Walk the chain of conversions. 641 while (Cursor) { 642 // If the type of the cursor has fewer lanes than the final result, zeroing 643 // must take place, which breaks the equivalence chain. 644 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 645 if (CursorVTy->getElementCount().getKnownMinValue() < 646 IVTy->getElementCount().getKnownMinValue()) 647 break; 648 649 // If the cursor has the same type as I, it is a viable replacement. 650 if (Cursor->getType() == IVTy) 651 EarliestReplacement = Cursor; 652 653 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 654 655 // If this is not an SVE conversion intrinsic, this is the end of the chain. 656 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 657 Intrinsic::aarch64_sve_convert_to_svbool || 658 IntrinsicCursor->getIntrinsicID() == 659 Intrinsic::aarch64_sve_convert_from_svbool)) 660 break; 661 662 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 663 Cursor = IntrinsicCursor->getOperand(0); 664 } 665 666 // If no viable replacement in the conversion chain was found, there is 667 // nothing to do. 
668 if (!EarliestReplacement) 669 return std::nullopt; 670 671 return IC.replaceInstUsesWith(II, EarliestReplacement); 672 } 673 674 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, 675 IntrinsicInst &II) { 676 IRBuilder<> Builder(&II); 677 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 678 II.getOperand(2)); 679 return IC.replaceInstUsesWith(II, Select); 680 } 681 682 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 683 IntrinsicInst &II) { 684 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 685 if (!Pg) 686 return std::nullopt; 687 688 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 689 return std::nullopt; 690 691 const auto PTruePattern = 692 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 693 if (PTruePattern != AArch64SVEPredPattern::vl1) 694 return std::nullopt; 695 696 // The intrinsic is inserting into lane zero so use an insert instead. 697 auto *IdxTy = Type::getInt64Ty(II.getContext()); 698 auto *Insert = InsertElementInst::Create( 699 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 700 Insert->insertBefore(&II); 701 Insert->takeName(&II); 702 703 return IC.replaceInstUsesWith(II, Insert); 704 } 705 706 static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 707 IntrinsicInst &II) { 708 // Replace DupX with a regular IR splat. 709 IRBuilder<> Builder(II.getContext()); 710 Builder.SetInsertPoint(&II); 711 auto *RetTy = cast<ScalableVectorType>(II.getType()); 712 Value *Splat = 713 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 714 Splat->takeName(&II); 715 return IC.replaceInstUsesWith(II, Splat); 716 } 717 718 static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 719 IntrinsicInst &II) { 720 LLVMContext &Ctx = II.getContext(); 721 IRBuilder<> Builder(Ctx); 722 Builder.SetInsertPoint(&II); 723 724 // Check that the predicate is all active 725 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 726 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 727 return std::nullopt; 728 729 const auto PTruePattern = 730 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 731 if (PTruePattern != AArch64SVEPredPattern::all) 732 return std::nullopt; 733 734 // Check that we have a compare of zero.. 
735 auto *SplatValue = 736 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 737 if (!SplatValue || !SplatValue->isZero()) 738 return std::nullopt; 739 740 // ..against a dupq 741 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 742 if (!DupQLane || 743 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 744 return std::nullopt; 745 746 // Where the dupq is a lane 0 replicate of a vector insert 747 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 748 return std::nullopt; 749 750 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 751 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) 752 return std::nullopt; 753 754 // Where the vector insert is a fixed constant vector insert into undef at 755 // index zero 756 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 757 return std::nullopt; 758 759 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 760 return std::nullopt; 761 762 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 763 if (!ConstVec) 764 return std::nullopt; 765 766 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 767 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 768 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 769 return std::nullopt; 770 771 unsigned NumElts = VecTy->getNumElements(); 772 unsigned PredicateBits = 0; 773 774 // Expand intrinsic operands to a 16-bit byte level predicate 775 for (unsigned I = 0; I < NumElts; ++I) { 776 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 777 if (!Arg) 778 return std::nullopt; 779 if (!Arg->isZero()) 780 PredicateBits |= 1 << (I * (16 / NumElts)); 781 } 782 783 // If all bits are zero bail early with an empty predicate 784 if (PredicateBits == 0) { 785 auto *PFalse = Constant::getNullValue(II.getType()); 786 PFalse->takeName(&II); 787 return IC.replaceInstUsesWith(II, PFalse); 788 } 789 790 // Calculate largest predicate type used (where byte predicate is largest) 791 unsigned Mask = 8; 792 for (unsigned I = 0; I < 16; ++I) 793 if ((PredicateBits & (1 << I)) != 0) 794 Mask |= (I % 8); 795 796 unsigned PredSize = Mask & -Mask; 797 auto *PredType = ScalableVectorType::get( 798 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 799 800 // Ensure all relevant bits are set 801 for (unsigned I = 0; I < 16; I += PredSize) 802 if ((PredicateBits & (1 << I)) == 0) 803 return std::nullopt; 804 805 auto *PTruePat = 806 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 807 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 808 {PredType}, {PTruePat}); 809 auto *ConvertToSVBool = Builder.CreateIntrinsic( 810 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 811 auto *ConvertFromSVBool = 812 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 813 {II.getType()}, {ConvertToSVBool}); 814 815 ConvertFromSVBool->takeName(&II); 816 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 817 } 818 819 static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, 820 IntrinsicInst &II) { 821 IRBuilder<> Builder(II.getContext()); 822 Builder.SetInsertPoint(&II); 823 Value *Pg = II.getArgOperand(0); 824 Value *Vec = II.getArgOperand(1); 825 auto IntrinsicID = II.getIntrinsicID(); 826 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 827 828 // lastX(splat(X)) --> X 829 if (auto *SplatVal = getSplatValue(Vec)) 830 return IC.replaceInstUsesWith(II, SplatVal); 831 832 // If x 
and/or y is a splat value then: 833 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 834 Value *LHS, *RHS; 835 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 836 if (isSplatValue(LHS) || isSplatValue(RHS)) { 837 auto *OldBinOp = cast<BinaryOperator>(Vec); 838 auto OpC = OldBinOp->getOpcode(); 839 auto *NewLHS = 840 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 841 auto *NewRHS = 842 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 843 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 844 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 845 return IC.replaceInstUsesWith(II, NewBinOp); 846 } 847 } 848 849 auto *C = dyn_cast<Constant>(Pg); 850 if (IsAfter && C && C->isNullValue()) { 851 // The intrinsic is extracting lane 0 so use an extract instead. 852 auto *IdxTy = Type::getInt64Ty(II.getContext()); 853 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 854 Extract->insertBefore(&II); 855 Extract->takeName(&II); 856 return IC.replaceInstUsesWith(II, Extract); 857 } 858 859 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 860 if (!IntrPG) 861 return std::nullopt; 862 863 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 864 return std::nullopt; 865 866 const auto PTruePattern = 867 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 868 869 // Can the intrinsic's predicate be converted to a known constant index? 870 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 871 if (!MinNumElts) 872 return std::nullopt; 873 874 unsigned Idx = MinNumElts - 1; 875 // Increment the index if extracting the element after the last active 876 // predicate element. 877 if (IsAfter) 878 ++Idx; 879 880 // Ignore extracts whose index is larger than the known minimum vector 881 // length. NOTE: This is an artificial constraint where we prefer to 882 // maintain what the user asked for until an alternative is proven faster. 883 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 884 if (Idx >= PgVTy->getMinNumElements()) 885 return std::nullopt; 886 887 // The intrinsic is extracting a fixed lane so use an extract instead. 888 auto *IdxTy = Type::getInt64Ty(II.getContext()); 889 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 890 Extract->insertBefore(&II); 891 Extract->takeName(&II); 892 return IC.replaceInstUsesWith(II, Extract); 893 } 894 895 static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, 896 IntrinsicInst &II) { 897 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar 898 // integer variant across a variety of micro-architectures. Replace scalar 899 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple 900 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more 901 // depending on the micro-architecture, but has been observed as generally 902 // being faster, particularly when the CLAST[AB] op is a loop-carried 903 // dependency. 
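  // An illustrative sketch (clasta here stands for the clasta.n intrinsic):
  //   %r = clasta(%pg, i32 %fallback, <vscale x 4 x i32> %v)
  // is rewritten as
  //   %fb = bitcast i32 %fallback to float
  //   %vf = bitcast <vscale x 4 x i32> %v to <vscale x 4 x float>
  //   %rf = clasta(%pg, float %fb, <vscale x 4 x float> %vf)
  //   %r  = bitcast float %rf to i32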
904 IRBuilder<> Builder(II.getContext()); 905 Builder.SetInsertPoint(&II); 906 Value *Pg = II.getArgOperand(0); 907 Value *Fallback = II.getArgOperand(1); 908 Value *Vec = II.getArgOperand(2); 909 Type *Ty = II.getType(); 910 911 if (!Ty->isIntegerTy()) 912 return std::nullopt; 913 914 Type *FPTy; 915 switch (cast<IntegerType>(Ty)->getBitWidth()) { 916 default: 917 return std::nullopt; 918 case 16: 919 FPTy = Builder.getHalfTy(); 920 break; 921 case 32: 922 FPTy = Builder.getFloatTy(); 923 break; 924 case 64: 925 FPTy = Builder.getDoubleTy(); 926 break; 927 } 928 929 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); 930 auto *FPVTy = VectorType::get( 931 FPTy, cast<VectorType>(Vec->getType())->getElementCount()); 932 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); 933 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, 934 {Pg, FPFallBack, FPVec}); 935 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); 936 return IC.replaceInstUsesWith(II, FPIItoInt); 937 } 938 939 static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 940 IntrinsicInst &II) { 941 LLVMContext &Ctx = II.getContext(); 942 IRBuilder<> Builder(Ctx); 943 Builder.SetInsertPoint(&II); 944 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 945 // can work with RDFFR_PP for ptest elimination. 946 auto *AllPat = 947 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 948 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 949 {II.getType()}, {AllPat}); 950 auto *RDFFR = 951 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 952 RDFFR->takeName(&II); 953 return IC.replaceInstUsesWith(II, RDFFR); 954 } 955 956 static std::optional<Instruction *> 957 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 958 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 959 960 if (Pattern == AArch64SVEPredPattern::all) { 961 LLVMContext &Ctx = II.getContext(); 962 IRBuilder<> Builder(Ctx); 963 Builder.SetInsertPoint(&II); 964 965 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 966 auto *VScale = Builder.CreateVScale(StepVal); 967 VScale->takeName(&II); 968 return IC.replaceInstUsesWith(II, VScale); 969 } 970 971 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 972 973 return MinNumElts && NumElts >= MinNumElts 974 ? std::optional<Instruction *>(IC.replaceInstUsesWith( 975 II, ConstantInt::get(II.getType(), MinNumElts))) 976 : std::nullopt; 977 } 978 979 static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 980 IntrinsicInst &II) { 981 Value *PgVal = II.getArgOperand(0); 982 Value *OpVal = II.getArgOperand(1); 983 984 IRBuilder<> Builder(II.getContext()); 985 Builder.SetInsertPoint(&II); 986 987 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). 988 // Later optimizations prefer this form. 
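  // For example, PTEST_FIRST(X, X) asks whether the first lane of X that is
  // active under X itself is set; since any such lane is set by definition,
  // this reduces to "is any lane of X set", i.e. PTEST_ANY(X, X). The same
  // reasoning applies to PTEST_LAST.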
989 if (PgVal == OpVal && 990 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || 991 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { 992 Value *Ops[] = {PgVal, OpVal}; 993 Type *Tys[] = {PgVal->getType()}; 994 995 auto *PTest = 996 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops); 997 PTest->takeName(&II); 998 999 return IC.replaceInstUsesWith(II, PTest); 1000 } 1001 1002 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal); 1003 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal); 1004 1005 if (!Pg || !Op) 1006 return std::nullopt; 1007 1008 Intrinsic::ID OpIID = Op->getIntrinsicID(); 1009 1010 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 1011 OpIID == Intrinsic::aarch64_sve_convert_to_svbool && 1012 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) { 1013 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)}; 1014 Type *Tys[] = {Pg->getArgOperand(0)->getType()}; 1015 1016 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 1017 1018 PTest->takeName(&II); 1019 return IC.replaceInstUsesWith(II, PTest); 1020 } 1021 1022 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)). 1023 // Later optimizations may rewrite sequence to use the flag-setting variant 1024 // of instruction X to remove PTEST. 1025 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && 1026 ((OpIID == Intrinsic::aarch64_sve_brka_z) || 1027 (OpIID == Intrinsic::aarch64_sve_brkb_z) || 1028 (OpIID == Intrinsic::aarch64_sve_brkpa_z) || 1029 (OpIID == Intrinsic::aarch64_sve_brkpb_z) || 1030 (OpIID == Intrinsic::aarch64_sve_rdffr_z) || 1031 (OpIID == Intrinsic::aarch64_sve_and_z) || 1032 (OpIID == Intrinsic::aarch64_sve_bic_z) || 1033 (OpIID == Intrinsic::aarch64_sve_eor_z) || 1034 (OpIID == Intrinsic::aarch64_sve_nand_z) || 1035 (OpIID == Intrinsic::aarch64_sve_nor_z) || 1036 (OpIID == Intrinsic::aarch64_sve_orn_z) || 1037 (OpIID == Intrinsic::aarch64_sve_orr_z))) { 1038 Value *Ops[] = {Pg->getArgOperand(0), Pg}; 1039 Type *Tys[] = {Pg->getType()}; 1040 1041 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 1042 PTest->takeName(&II); 1043 1044 return IC.replaceInstUsesWith(II, PTest); 1045 } 1046 1047 return std::nullopt; 1048 } 1049 1050 template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> 1051 static std::optional<Instruction *> 1052 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, 1053 bool MergeIntoAddendOp) { 1054 Value *P = II.getOperand(0); 1055 Value *MulOp0, *MulOp1, *AddendOp, *Mul; 1056 if (MergeIntoAddendOp) { 1057 AddendOp = II.getOperand(1); 1058 Mul = II.getOperand(2); 1059 } else { 1060 AddendOp = II.getOperand(2); 1061 Mul = II.getOperand(1); 1062 } 1063 1064 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0), 1065 m_Value(MulOp1)))) 1066 return std::nullopt; 1067 1068 if (!Mul->hasOneUse()) 1069 return std::nullopt; 1070 1071 Instruction *FMFSource = nullptr; 1072 if (II.getType()->isFPOrFPVectorTy()) { 1073 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 1074 // Stop the combine when the flags on the inputs differ in case dropping 1075 // flags would lead to us missing out on more beneficial optimizations. 
1076 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags()) 1077 return std::nullopt; 1078 if (!FAddFlags.allowContract()) 1079 return std::nullopt; 1080 FMFSource = &II; 1081 } 1082 1083 IRBuilder<> Builder(II.getContext()); 1084 Builder.SetInsertPoint(&II); 1085 1086 CallInst *Res; 1087 if (MergeIntoAddendOp) 1088 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()}, 1089 {P, AddendOp, MulOp0, MulOp1}, FMFSource); 1090 else 1091 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()}, 1092 {P, MulOp0, MulOp1, AddendOp}, FMFSource); 1093 1094 return IC.replaceInstUsesWith(II, Res); 1095 } 1096 1097 static bool isAllActivePredicate(Value *Pred) { 1098 // Look through convert.from.svbool(convert.to.svbool(...) chain. 1099 Value *UncastedPred; 1100 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 1101 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 1102 m_Value(UncastedPred))))) 1103 // If the predicate has the same or less lanes than the uncasted 1104 // predicate then we know the casting has no effect. 1105 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 1106 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 1107 Pred = UncastedPred; 1108 1109 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1110 m_ConstantInt<AArch64SVEPredPattern::all>())); 1111 } 1112 1113 static std::optional<Instruction *> 1114 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1115 IRBuilder<> Builder(II.getContext()); 1116 Builder.SetInsertPoint(&II); 1117 1118 Value *Pred = II.getOperand(0); 1119 Value *PtrOp = II.getOperand(1); 1120 Type *VecTy = II.getType(); 1121 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 1122 1123 if (isAllActivePredicate(Pred)) { 1124 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 1125 Load->copyMetadata(II); 1126 return IC.replaceInstUsesWith(II, Load); 1127 } 1128 1129 CallInst *MaskedLoad = 1130 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 1131 Pred, ConstantAggregateZero::get(VecTy)); 1132 MaskedLoad->copyMetadata(II); 1133 return IC.replaceInstUsesWith(II, MaskedLoad); 1134 } 1135 1136 static std::optional<Instruction *> 1137 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1138 IRBuilder<> Builder(II.getContext()); 1139 Builder.SetInsertPoint(&II); 1140 1141 Value *VecOp = II.getOperand(0); 1142 Value *Pred = II.getOperand(1); 1143 Value *PtrOp = II.getOperand(2); 1144 Value *VecPtr = 1145 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 1146 1147 if (isAllActivePredicate(Pred)) { 1148 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 1149 Store->copyMetadata(II); 1150 return IC.eraseInstFromFunction(II); 1151 } 1152 1153 CallInst *MaskedStore = Builder.CreateMaskedStore( 1154 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 1155 MaskedStore->copyMetadata(II); 1156 return IC.eraseInstFromFunction(II); 1157 } 1158 1159 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 1160 switch (Intrinsic) { 1161 case Intrinsic::aarch64_sve_fmul: 1162 return Instruction::BinaryOps::FMul; 1163 case Intrinsic::aarch64_sve_fadd: 1164 return Instruction::BinaryOps::FAdd; 1165 case Intrinsic::aarch64_sve_fsub: 1166 return Instruction::BinaryOps::FSub; 1167 default: 1168 return Instruction::BinaryOpsEnd; 1169 } 1170 } 1171 1172 static std::optional<Instruction *> 1173 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { 1174 auto *OpPredicate = 
II.getOperand(0); 1175 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 1176 if (BinOpCode == Instruction::BinaryOpsEnd || 1177 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1178 m_ConstantInt<AArch64SVEPredPattern::all>()))) 1179 return std::nullopt; 1180 IRBuilder<> Builder(II.getContext()); 1181 Builder.SetInsertPoint(&II); 1182 Builder.setFastMathFlags(II.getFastMathFlags()); 1183 auto BinOp = 1184 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 1185 return IC.replaceInstUsesWith(II, BinOp); 1186 } 1187 1188 static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, 1189 IntrinsicInst &II) { 1190 if (auto FMLA = 1191 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1192 Intrinsic::aarch64_sve_fmla>(IC, II, 1193 true)) 1194 return FMLA; 1195 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1196 Intrinsic::aarch64_sve_mla>( 1197 IC, II, true)) 1198 return MLA; 1199 if (auto FMAD = 1200 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1201 Intrinsic::aarch64_sve_fmad>(IC, II, 1202 false)) 1203 return FMAD; 1204 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1205 Intrinsic::aarch64_sve_mad>( 1206 IC, II, false)) 1207 return MAD; 1208 return instCombineSVEVectorBinOp(IC, II); 1209 } 1210 1211 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, 1212 IntrinsicInst &II) { 1213 if (auto FMLS = 1214 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1215 Intrinsic::aarch64_sve_fmls>(IC, II, 1216 true)) 1217 return FMLS; 1218 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1219 Intrinsic::aarch64_sve_mls>( 1220 IC, II, true)) 1221 return MLS; 1222 if (auto FMSB = 1223 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1224 Intrinsic::aarch64_sve_fnmsb>( 1225 IC, II, false)) 1226 return FMSB; 1227 return instCombineSVEVectorBinOp(IC, II); 1228 } 1229 1230 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 1231 IntrinsicInst &II) { 1232 auto *OpPredicate = II.getOperand(0); 1233 auto *OpMultiplicand = II.getOperand(1); 1234 auto *OpMultiplier = II.getOperand(2); 1235 1236 IRBuilder<> Builder(II.getContext()); 1237 Builder.SetInsertPoint(&II); 1238 1239 // Return true if a given instruction is a unit splat value, false otherwise. 1240 auto IsUnitSplat = [](auto *I) { 1241 auto *SplatValue = getSplatValue(I); 1242 if (!SplatValue) 1243 return false; 1244 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1245 }; 1246 1247 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1248 // with a unit splat value, false otherwise. 1249 auto IsUnitDup = [](auto *I) { 1250 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1251 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1252 return false; 1253 1254 auto *SplatValue = IntrI->getOperand(2); 1255 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1256 }; 1257 1258 if (IsUnitSplat(OpMultiplier)) { 1259 // [f]mul pg %n, (dupx 1) => %n 1260 OpMultiplicand->takeName(&II); 1261 return IC.replaceInstUsesWith(II, OpMultiplicand); 1262 } else if (IsUnitDup(OpMultiplier)) { 1263 // [f]mul pg %n, (dup pg 1) => %n 1264 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1265 auto *DupPg = DupInst->getOperand(1); 1266 // TODO: this is naive. 
The optimization is still valid if DupPg 1267 // 'encompasses' OpPredicate, not only if they're the same predicate. 1268 if (OpPredicate == DupPg) { 1269 OpMultiplicand->takeName(&II); 1270 return IC.replaceInstUsesWith(II, OpMultiplicand); 1271 } 1272 } 1273 1274 return instCombineSVEVectorBinOp(IC, II); 1275 } 1276 1277 static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1278 IntrinsicInst &II) { 1279 IRBuilder<> Builder(II.getContext()); 1280 Builder.SetInsertPoint(&II); 1281 Value *UnpackArg = II.getArgOperand(0); 1282 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1283 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1284 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1285 1286 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1287 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1288 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1289 ScalarArg = 1290 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1291 Value *NewVal = 1292 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1293 NewVal->takeName(&II); 1294 return IC.replaceInstUsesWith(II, NewVal); 1295 } 1296 1297 return std::nullopt; 1298 } 1299 static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1300 IntrinsicInst &II) { 1301 auto *OpVal = II.getOperand(0); 1302 auto *OpIndices = II.getOperand(1); 1303 VectorType *VTy = cast<VectorType>(II.getType()); 1304 1305 // Check whether OpIndices is a constant splat value < minimal element count 1306 // of result. 1307 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1308 if (!SplatValue || 1309 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1310 return std::nullopt; 1311 1312 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1313 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1314 IRBuilder<> Builder(II.getContext()); 1315 Builder.SetInsertPoint(&II); 1316 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1317 auto *VectorSplat = 1318 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1319 1320 VectorSplat->takeName(&II); 1321 return IC.replaceInstUsesWith(II, VectorSplat); 1322 } 1323 1324 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1325 IntrinsicInst &II) { 1326 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1327 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1328 Value *A, *B; 1329 if (match(II.getArgOperand(0), 1330 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1331 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1332 m_Specific(A), m_Specific(B)))) 1333 return IC.replaceInstUsesWith( 1334 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1335 1336 return std::nullopt; 1337 } 1338 1339 static std::optional<Instruction *> 1340 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { 1341 Value *Mask = II.getOperand(0); 1342 Value *BasePtr = II.getOperand(1); 1343 Value *Index = II.getOperand(2); 1344 Type *Ty = II.getType(); 1345 Value *PassThru = ConstantAggregateZero::get(Ty); 1346 1347 // Contiguous gather => masked load. 
1348 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1349 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1350 Value *IndexBase; 1351 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1352 m_Value(IndexBase), m_SpecificInt(1)))) { 1353 IRBuilder<> Builder(II.getContext()); 1354 Builder.SetInsertPoint(&II); 1355 1356 Align Alignment = 1357 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1358 1359 Type *VecPtrTy = PointerType::getUnqual(Ty); 1360 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), 1361 BasePtr, IndexBase); 1362 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1363 CallInst *MaskedLoad = 1364 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1365 MaskedLoad->takeName(&II); 1366 return IC.replaceInstUsesWith(II, MaskedLoad); 1367 } 1368 1369 return std::nullopt; 1370 } 1371 1372 static std::optional<Instruction *> 1373 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { 1374 Value *Val = II.getOperand(0); 1375 Value *Mask = II.getOperand(1); 1376 Value *BasePtr = II.getOperand(2); 1377 Value *Index = II.getOperand(3); 1378 Type *Ty = Val->getType(); 1379 1380 // Contiguous scatter => masked store. 1381 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1382 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1383 Value *IndexBase; 1384 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1385 m_Value(IndexBase), m_SpecificInt(1)))) { 1386 IRBuilder<> Builder(II.getContext()); 1387 Builder.SetInsertPoint(&II); 1388 1389 Align Alignment = 1390 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1391 1392 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), 1393 BasePtr, IndexBase); 1394 Type *VecPtrTy = PointerType::getUnqual(Ty); 1395 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1396 1397 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1398 1399 return IC.eraseInstFromFunction(II); 1400 } 1401 1402 return std::nullopt; 1403 } 1404 1405 static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1406 IntrinsicInst &II) { 1407 IRBuilder<> Builder(II.getContext()); 1408 Builder.SetInsertPoint(&II); 1409 Type *Int32Ty = Builder.getInt32Ty(); 1410 Value *Pred = II.getOperand(0); 1411 Value *Vec = II.getOperand(1); 1412 Value *DivVec = II.getOperand(2); 1413 1414 Value *SplatValue = getSplatValue(DivVec); 1415 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1416 if (!SplatConstantInt) 1417 return std::nullopt; 1418 APInt Divisor = SplatConstantInt->getValue(); 1419 1420 if (Divisor.isPowerOf2()) { 1421 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1422 auto ASRD = Builder.CreateIntrinsic( 1423 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1424 return IC.replaceInstUsesWith(II, ASRD); 1425 } 1426 if (Divisor.isNegatedPowerOf2()) { 1427 Divisor.negate(); 1428 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1429 auto ASRD = Builder.CreateIntrinsic( 1430 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1431 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1432 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1433 return IC.replaceInstUsesWith(II, NEG); 1434 } 1435 1436 return std::nullopt; 1437 } 1438 1439 bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { 1440 size_t VecSize = Vec.size(); 1441 if (VecSize == 1) 1442 return true; 1443 if 
(!isPowerOf2_64(VecSize)) 1444 return false; 1445 size_t HalfVecSize = VecSize / 2; 1446 1447 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; 1448 RHS != Vec.end(); LHS++, RHS++) { 1449 if (*LHS != nullptr && *RHS != nullptr) { 1450 if (*LHS == *RHS) 1451 continue; 1452 else 1453 return false; 1454 } 1455 if (!AllowPoison) 1456 return false; 1457 if (*LHS == nullptr && *RHS != nullptr) 1458 *LHS = *RHS; 1459 } 1460 1461 Vec.resize(HalfVecSize); 1462 SimplifyValuePattern(Vec, AllowPoison); 1463 return true; 1464 } 1465 1466 // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) 1467 // to dupqlane(f64(C)) where C is A concatenated with B 1468 static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, 1469 IntrinsicInst &II) { 1470 Value *CurrentInsertElt = nullptr, *Default = nullptr; 1471 if (!match(II.getOperand(0), 1472 m_Intrinsic<Intrinsic::vector_insert>( 1473 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || 1474 !isa<FixedVectorType>(CurrentInsertElt->getType())) 1475 return std::nullopt; 1476 auto IIScalableTy = cast<ScalableVectorType>(II.getType()); 1477 1478 // Insert the scalars into a container ordered by InsertElement index 1479 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); 1480 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { 1481 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2)); 1482 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); 1483 CurrentInsertElt = InsertElt->getOperand(0); 1484 } 1485 1486 bool AllowPoison = 1487 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); 1488 if (!SimplifyValuePattern(Elts, AllowPoison)) 1489 return std::nullopt; 1490 1491 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) 1492 IRBuilder<> Builder(II.getContext()); 1493 Builder.SetInsertPoint(&II); 1494 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); 1495 for (size_t I = 0; I < Elts.size(); I++) { 1496 if (Elts[I] == nullptr) 1497 continue; 1498 InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I], 1499 Builder.getInt64(I)); 1500 } 1501 if (InsertEltChain == nullptr) 1502 return std::nullopt; 1503 1504 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 1505 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector 1506 // be bitcast to a type wide enough to fit the sequence, be splatted, and then 1507 // be narrowed back to the original type. 
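  // Worked example (illustrative): for a <vscale x 8 x half> result whose
  // insert chain simplified to two elements (a, b):
  //   PatternWidth        = 16 bits * 2           = 32
  //   PatternElementCount = 16 bits * 8 / 32 bits = 4
  // so the (a, b) pair becomes lane 0 of a <vscale x 4 x i32> view of the
  // vector, a zeroinitializer shuffle mask splats that lane, and the result
  // is bitcast back to <vscale x 8 x half>.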
1508 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); 1509 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * 1510 IIScalableTy->getMinNumElements() / 1511 PatternWidth; 1512 1513 IntegerType *WideTy = Builder.getIntNTy(PatternWidth); 1514 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); 1515 auto *WideShuffleMaskTy = 1516 ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount); 1517 1518 auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0)); 1519 auto InsertSubvector = Builder.CreateInsertVector( 1520 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx); 1521 auto WideBitcast = 1522 Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); 1523 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); 1524 auto WideShuffle = Builder.CreateShuffleVector( 1525 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); 1526 auto NarrowBitcast = 1527 Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); 1528 1529 return IC.replaceInstUsesWith(II, NarrowBitcast); 1530 } 1531 1532 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 1533 IntrinsicInst &II) { 1534 Value *A = II.getArgOperand(0); 1535 Value *B = II.getArgOperand(1); 1536 if (A == B) 1537 return IC.replaceInstUsesWith(II, A); 1538 1539 return std::nullopt; 1540 } 1541 1542 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 1543 IntrinsicInst &II) { 1544 IRBuilder<> Builder(&II); 1545 Value *Pred = II.getOperand(0); 1546 Value *Vec = II.getOperand(1); 1547 Value *Shift = II.getOperand(2); 1548 1549 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 1550 Value *AbsPred, *MergedValue; 1551 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 1552 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 1553 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 1554 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 1555 1556 return std::nullopt; 1557 1558 // Transform is valid if any of the following are true: 1559 // * The ABS merge value is an undef or non-negative 1560 // * The ABS predicate is all active 1561 // * The ABS predicate and the SRSHL predicates are the same 1562 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && 1563 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 1564 return std::nullopt; 1565 1566 // Only valid when the shift amount is non-negative, otherwise the rounding 1567 // behaviour of SRSHL cannot be ignored. 
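  // For example, with a shift of -1 SRSHL performs a rounding shift right,
  // computing (x + 1) >> 1, which LSL cannot express.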
1568 if (!match(Shift, m_NonNegative())) 1569 return std::nullopt; 1570 1571 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, 1572 {Pred, Vec, Shift}); 1573 1574 return IC.replaceInstUsesWith(II, LSL); 1575 } 1576 1577 std::optional<Instruction *> 1578 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1579 IntrinsicInst &II) const { 1580 Intrinsic::ID IID = II.getIntrinsicID(); 1581 switch (IID) { 1582 default: 1583 break; 1584 case Intrinsic::aarch64_neon_fmaxnm: 1585 case Intrinsic::aarch64_neon_fminnm: 1586 return instCombineMaxMinNM(IC, II); 1587 case Intrinsic::aarch64_sve_convert_from_svbool: 1588 return instCombineConvertFromSVBool(IC, II); 1589 case Intrinsic::aarch64_sve_dup: 1590 return instCombineSVEDup(IC, II); 1591 case Intrinsic::aarch64_sve_dup_x: 1592 return instCombineSVEDupX(IC, II); 1593 case Intrinsic::aarch64_sve_cmpne: 1594 case Intrinsic::aarch64_sve_cmpne_wide: 1595 return instCombineSVECmpNE(IC, II); 1596 case Intrinsic::aarch64_sve_rdffr: 1597 return instCombineRDFFR(IC, II); 1598 case Intrinsic::aarch64_sve_lasta: 1599 case Intrinsic::aarch64_sve_lastb: 1600 return instCombineSVELast(IC, II); 1601 case Intrinsic::aarch64_sve_clasta_n: 1602 case Intrinsic::aarch64_sve_clastb_n: 1603 return instCombineSVECondLast(IC, II); 1604 case Intrinsic::aarch64_sve_cntd: 1605 return instCombineSVECntElts(IC, II, 2); 1606 case Intrinsic::aarch64_sve_cntw: 1607 return instCombineSVECntElts(IC, II, 4); 1608 case Intrinsic::aarch64_sve_cnth: 1609 return instCombineSVECntElts(IC, II, 8); 1610 case Intrinsic::aarch64_sve_cntb: 1611 return instCombineSVECntElts(IC, II, 16); 1612 case Intrinsic::aarch64_sve_ptest_any: 1613 case Intrinsic::aarch64_sve_ptest_first: 1614 case Intrinsic::aarch64_sve_ptest_last: 1615 return instCombineSVEPTest(IC, II); 1616 case Intrinsic::aarch64_sve_mul: 1617 case Intrinsic::aarch64_sve_fmul: 1618 return instCombineSVEVectorMul(IC, II); 1619 case Intrinsic::aarch64_sve_fadd: 1620 case Intrinsic::aarch64_sve_add: 1621 return instCombineSVEVectorAdd(IC, II); 1622 case Intrinsic::aarch64_sve_fsub: 1623 case Intrinsic::aarch64_sve_sub: 1624 return instCombineSVEVectorSub(IC, II); 1625 case Intrinsic::aarch64_sve_tbl: 1626 return instCombineSVETBL(IC, II); 1627 case Intrinsic::aarch64_sve_uunpkhi: 1628 case Intrinsic::aarch64_sve_uunpklo: 1629 case Intrinsic::aarch64_sve_sunpkhi: 1630 case Intrinsic::aarch64_sve_sunpklo: 1631 return instCombineSVEUnpack(IC, II); 1632 case Intrinsic::aarch64_sve_zip1: 1633 case Intrinsic::aarch64_sve_zip2: 1634 return instCombineSVEZip(IC, II); 1635 case Intrinsic::aarch64_sve_ld1_gather_index: 1636 return instCombineLD1GatherIndex(IC, II); 1637 case Intrinsic::aarch64_sve_st1_scatter_index: 1638 return instCombineST1ScatterIndex(IC, II); 1639 case Intrinsic::aarch64_sve_ld1: 1640 return instCombineSVELD1(IC, II, DL); 1641 case Intrinsic::aarch64_sve_st1: 1642 return instCombineSVEST1(IC, II, DL); 1643 case Intrinsic::aarch64_sve_sdiv: 1644 return instCombineSVESDIV(IC, II); 1645 case Intrinsic::aarch64_sve_sel: 1646 return instCombineSVESel(IC, II); 1647 case Intrinsic::aarch64_sve_srshl: 1648 return instCombineSVESrshl(IC, II); 1649 case Intrinsic::aarch64_sve_dupq_lane: 1650 return instCombineSVEDupqLane(IC, II); 1651 } 1652 1653 return std::nullopt; 1654 } 1655 1656 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1657 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1658 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1659 
std::function<void(Instruction *, unsigned, APInt, APInt &)> 1660 SimplifyAndSetOp) const { 1661 switch (II.getIntrinsicID()) { 1662 default: 1663 break; 1664 case Intrinsic::aarch64_neon_fcvtxn: 1665 case Intrinsic::aarch64_neon_rshrn: 1666 case Intrinsic::aarch64_neon_sqrshrn: 1667 case Intrinsic::aarch64_neon_sqrshrun: 1668 case Intrinsic::aarch64_neon_sqshrn: 1669 case Intrinsic::aarch64_neon_sqshrun: 1670 case Intrinsic::aarch64_neon_sqxtn: 1671 case Intrinsic::aarch64_neon_sqxtun: 1672 case Intrinsic::aarch64_neon_uqrshrn: 1673 case Intrinsic::aarch64_neon_uqshrn: 1674 case Intrinsic::aarch64_neon_uqxtn: 1675 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1676 break; 1677 } 1678 1679 return std::nullopt; 1680 } 1681 1682 TypeSize 1683 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 1684 switch (K) { 1685 case TargetTransformInfo::RGK_Scalar: 1686 return TypeSize::getFixed(64); 1687 case TargetTransformInfo::RGK_FixedWidthVector: 1688 if (!ST->isStreamingSVEModeDisabled() && 1689 !EnableFixedwidthAutovecInStreamingMode) 1690 return TypeSize::getFixed(0); 1691 1692 if (ST->hasSVE()) 1693 return TypeSize::getFixed( 1694 std::max(ST->getMinSVEVectorSizeInBits(), 128u)); 1695 1696 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0); 1697 case TargetTransformInfo::RGK_ScalableVector: 1698 if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode) 1699 return TypeSize::getScalable(0); 1700 1701 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0); 1702 } 1703 llvm_unreachable("Unsupported register kind"); 1704 } 1705 1706 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1707 ArrayRef<const Value *> Args) { 1708 1709 // A helper that returns a vector type from the given type. The number of 1710 // elements in type Ty determines the vector width. 1711 auto toVectorTy = [&](Type *ArgTy) { 1712 return VectorType::get(ArgTy->getScalarType(), 1713 cast<VectorType>(DstTy)->getElementCount()); 1714 }; 1715 1716 // Exit early if DstTy is not a vector type whose elements are at least 1717 // 16-bits wide. SVE doesn't generally have the same set of instructions to 1718 // perform an extend with the add/sub/mul. There are SMULLB style 1719 // instructions, but they operate on top/bottom, requiring some sort of lane 1720 // interleaving to be used with zext/sext. 1721 if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16) 1722 return false; 1723 1724 // Determine if the operation has a widening variant. We consider both the 1725 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1726 // instructions. 1727 // 1728 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1729 // verify that their extending operands are eliminated during code 1730 // generation. 1731 switch (Opcode) { 1732 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1733 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1734 case Instruction::Mul: // SMULL(2), UMULL(2) 1735 break; 1736 default: 1737 return false; 1738 } 1739 1740 // To be a widening instruction (either the "wide" or "long" versions), the 1741 // second operand must be a sign- or zero extend. 1742 if (Args.size() != 2 || 1743 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1744 return false; 1745 auto *Extend = cast<CastInst>(Args[1]); 1746 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1747 1748 // A mul only has a mull version (not like addw). Both operands need to be 1749 // extending and the same type. 
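  // For instance (illustrative only), mul(zext <8 x i8> to <8 x i16>,
  // zext <8 x i8> to <8 x i16>) can be selected as umull, but there is no
  // "wide" umulw form for mul(<8 x i16>, zext <8 x i8> to <8 x i16>), unlike
  // add/sub which have uaddw/usubw for that shape.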
1750 if (Opcode == Instruction::Mul && 1751 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1752 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1753 return false; 1754 1755 // Legalize the destination type and ensure it can be used in a widening 1756 // operation. 1757 auto DstTyL = getTypeLegalizationCost(DstTy); 1758 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1759 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1760 return false; 1761 1762 // Legalize the source type and ensure it can be used in a widening 1763 // operation. 1764 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1765 auto SrcTyL = getTypeLegalizationCost(SrcTy); 1766 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1767 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1768 return false; 1769 1770 // Get the total number of vector elements in the legalized types. 1771 InstructionCost NumDstEls = 1772 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1773 InstructionCost NumSrcEls = 1774 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1775 1776 // Return true if the legalized types have the same number of vector elements 1777 // and the destination element type size is twice that of the source type. 1778 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1779 } 1780 1781 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1782 Type *Src, 1783 TTI::CastContextHint CCH, 1784 TTI::TargetCostKind CostKind, 1785 const Instruction *I) { 1786 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1787 assert(ISD && "Invalid opcode"); 1788 1789 // If the cast is observable, and it is used by a widening instruction (e.g., 1790 // uaddl, saddw, etc.), it may be free. 1791 if (I && I->hasOneUser()) { 1792 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1793 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1794 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1795 // If the cast is the second operand, it is free. We will generate either 1796 // a "wide" or "long" version of the widening instruction. 1797 if (I == SingleUser->getOperand(1)) 1798 return 0; 1799 // If the cast is not the second operand, it will be free if it looks the 1800 // same as the second operand. In this case, we will generate a "long" 1801 // version of the widening instruction. 1802 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1803 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1804 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1805 return 0; 1806 } 1807 } 1808 1809 // TODO: Allow non-throughput costs that aren't binary. 1810 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1811 if (CostKind != TTI::TCK_RecipThroughput) 1812 return Cost == 0 ? 
0 : 1; 1813 return Cost; 1814 }; 1815 1816 EVT SrcTy = TLI->getValueType(DL, Src); 1817 EVT DstTy = TLI->getValueType(DL, Dst); 1818 1819 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1820 return AdjustCost( 1821 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1822 1823 static const TypeConversionCostTblEntry 1824 ConversionTbl[] = { 1825 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn 1826 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn 1827 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn 1828 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn 1829 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 1830 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn 1831 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn 1832 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 1833 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn 1834 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn 1835 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn 1836 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 1837 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 1838 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 1839 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 1840 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 1841 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 1842 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 1843 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 1844 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 1845 1846 // Truncations on nxvmiN 1847 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1848 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1849 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1850 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1851 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1852 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1853 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1854 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1855 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1856 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1857 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1858 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1859 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1860 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1861 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1862 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1863 1864 // The number of shll instructions for the extension. 
1865 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1866 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1867 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1868 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1869 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1870 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1871 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1872 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1873 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1874 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1875 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1876 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1877 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1878 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1879 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1880 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1881 1882 // LowerVectorINT_TO_FP: 1883 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1884 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1885 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1886 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1887 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1888 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1889 1890 // Complex: to v2f32 1891 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1892 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1893 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1894 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1895 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1896 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1897 1898 // Complex: to v4f32 1899 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1900 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1901 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1902 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1903 1904 // Complex: to v8f32 1905 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1906 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1907 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1908 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1909 1910 // Complex: to v16f32 1911 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1912 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1913 1914 // Complex: to v2f64 1915 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1916 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1917 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1918 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1919 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1920 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1921 1922 // Complex: to v4f64 1923 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, 1924 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, 1925 1926 // LowerVectorFP_TO_INT 1927 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1928 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1929 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1930 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1931 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1932 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1933 1934 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 
1935 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1936 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1937 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1938 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1939 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1940 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1941 1942 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1943 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1944 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1945 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1946 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1947 1948 // Complex, from nxv2f32. 1949 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1950 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1951 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1952 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1953 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1954 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1955 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1956 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1957 1958 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1959 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1960 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1961 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1962 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1963 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1964 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1965 1966 // Complex, from nxv2f64. 1967 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1968 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1969 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1970 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1971 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1972 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1973 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1974 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1975 1976 // Complex, from nxv4f32. 1977 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1978 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1979 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1980 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1981 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1982 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1983 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1984 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1985 1986 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 1987 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1988 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1989 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1990 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1991 1992 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1993 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1994 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1995 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1996 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1997 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1998 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1999 2000 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 2001 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 2002 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 2003 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 2004 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 2005 2006 // Complex, from nxv8f16. 
2007 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 2008 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 2009 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 2010 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 2011 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 2012 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 2013 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 2014 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 2015 2016 // Complex, from nxv4f16. 2017 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 2018 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 2019 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 2020 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 2021 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 2022 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 2023 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 2024 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 2025 2026 // Complex, from nxv2f16. 2027 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 2028 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 2029 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 2030 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 2031 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 2032 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 2033 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 2034 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 2035 2036 // Truncate from nxvmf32 to nxvmf16. 2037 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 2038 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 2039 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 2040 2041 // Truncate from nxvmf64 to nxvmf16. 2042 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 2043 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 2044 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 2045 2046 // Truncate from nxvmf64 to nxvmf32. 2047 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 2048 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 2049 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 2050 2051 // Extend from nxvmf16 to nxvmf32. 2052 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 2053 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 2054 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 2055 2056 // Extend from nxvmf16 to nxvmf64. 2057 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 2058 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 2059 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 2060 2061 // Extend from nxvmf32 to nxvmf64. 
2062 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 2063 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 2064 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 2065 2066 // Bitcasts from float to integer 2067 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 2068 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 2069 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 2070 2071 // Bitcasts from integer to float 2072 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 2073 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 2074 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 2075 }; 2076 2077 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 2078 DstTy.getSimpleVT(), 2079 SrcTy.getSimpleVT())) 2080 return AdjustCost(Entry->Cost); 2081 2082 static const TypeConversionCostTblEntry FP16Tbl[] = { 2083 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 2084 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 2085 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 2086 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 2087 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 2088 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 2089 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 2090 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 2091 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 2092 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 2093 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 2094 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 2095 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 2096 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 2097 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 2098 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 2099 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 2100 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 2101 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 2102 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 2103 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 2104 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 2105 }; 2106 2107 if (ST->hasFullFP16()) 2108 if (const auto *Entry = ConvertCostTableLookup( 2109 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 2110 return AdjustCost(Entry->Cost); 2111 2112 return AdjustCost( 2113 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 2114 } 2115 2116 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 2117 Type *Dst, 2118 VectorType *VecTy, 2119 unsigned Index) { 2120 2121 // Make sure we were given a valid extend opcode. 2122 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 2123 "Invalid opcode"); 2124 2125 // We are extending an element we extract from a vector, so the source type 2126 // of the extend is the element type of the vector. 2127 auto *Src = VecTy->getElementType(); 2128 2129 // Sign- and zero-extends are for integer types only. 2130 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 2131 2132 // Get the cost for the extract. We compute the cost (if any) for the extend 2133 // below. 2134 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2135 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, 2136 CostKind, Index, nullptr, nullptr); 2137 2138 // Legalize the types. 
2139 auto VecLT = getTypeLegalizationCost(VecTy); 2140 auto DstVT = TLI->getValueType(DL, Dst); 2141 auto SrcVT = TLI->getValueType(DL, Src); 2142 2143 // If the resulting type is still a vector and the destination type is legal, 2144 // we may get the extension for free. If not, get the default cost for the 2145 // extend. 2146 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 2147 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2148 CostKind); 2149 2150 // The destination type should be larger than the element type. If not, get 2151 // the default cost for the extend. 2152 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 2153 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2154 CostKind); 2155 2156 switch (Opcode) { 2157 default: 2158 llvm_unreachable("Opcode should be either SExt or ZExt"); 2159 2160 // For sign-extends, we only need a smov, which performs the extension 2161 // automatically. 2162 case Instruction::SExt: 2163 return Cost; 2164 2165 // For zero-extends, the extend is performed automatically by a umov unless 2166 // the destination type is i64 and the element type is i8 or i16. 2167 case Instruction::ZExt: 2168 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 2169 return Cost; 2170 } 2171 2172 // If we are unable to perform the extend for free, get the default cost. 2173 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2174 CostKind); 2175 } 2176 2177 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 2178 TTI::TargetCostKind CostKind, 2179 const Instruction *I) { 2180 if (CostKind != TTI::TCK_RecipThroughput) 2181 return Opcode == Instruction::PHI ? 0 : 1; 2182 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 2183 // Branches are assumed to be predicted. 2184 return 0; 2185 } 2186 2187 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val, 2188 unsigned Index, 2189 bool HasRealUse) { 2190 assert(Val->isVectorTy() && "This must be a vector type"); 2191 2192 if (Index != -1U) { 2193 // Legalize the type. 2194 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 2195 2196 // This type is legalized to a scalar type. 2197 if (!LT.second.isVector()) 2198 return 0; 2199 2200 // The type may be split. For fixed-width vectors we can normalize the 2201 // index to the new type. 2202 if (LT.second.isFixedLengthVector()) { 2203 unsigned Width = LT.second.getVectorNumElements(); 2204 Index = Index % Width; 2205 } 2206 2207 // The element at index zero is already inside the vector. 2208 // - For a physical (HasRealUse==true) insert-element or extract-element 2209 // instruction that extracts integers, an explicit FPR -> GPR move is 2210 // needed. So it has non-zero cost. 2211 // - For the rest of cases (virtual instruction or element type is float), 2212 // consider the instruction free. 2213 // 2214 // FIXME: 2215 // If the extract-element and insert-element instructions could be 2216 // simplified away (e.g., could be combined into users by looking at use-def 2217 // context), they have no cost. This is not done in the first place for 2218 // compile-time considerations. 2219 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) 2220 return 0; 2221 } 2222 2223 // All other insert/extracts cost this much. 
  return ST->getVectorInsertExtractBaseCost();
}

InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index, Value *Op0,
                                                   Value *Op1) {
  return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
}

InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                   Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index) {
  return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
}

InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  case ISD::SDIV:
    if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
      InstructionCost Cost = getArithmeticInstrCost(
          Instruction::Add, Ty, CostKind,
          Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(
          Instruction::Select, Ty, CostKind,
          Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      return Cost;
    }
    [[fallthrough]];
  case ISD::UDIV: {
    if (Op2Info.isConstant() && Op2Info.isUniform()) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the sequence
        // MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division to
        // MULHS + SUB + SRL + ADD + SRL.
        InstructionCost MulCost = getArithmeticInstrCost(
            Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
            Op2Info.getNoProps());
        InstructionCost AddCost = getArithmeticInstrCost(
            Instruction::Add, Ty, CostKind, Op1Info.getNoProps(),
            Op2Info.getNoProps());
        InstructionCost ShrCost = getArithmeticInstrCost(
            Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(),
            Op2Info.getNoProps());
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Op1Info, Op2Info);
    if (Ty->isVectorTy()) {
      if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
        // SDIV/UDIV operations are lowered using SVE, so the cost can be
        // lower.
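        // For example, reading the table below, an sdiv on a sub-128-bit
        // v4i16 is given a cost of 5 rather than the much larger scalarized
        // estimate the generic implementation would return. (Illustrative
        // reading of the table, not a measured figure.)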
        if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
                                                ->getPrimitiveSizeInBits()
                                                .getFixedValue() < 128) {
          EVT VT = TLI->getValueType(DL, Ty);
          static const CostTblEntry DivTbl[]{
              {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
              {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
              {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
              {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
              {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
              {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};

          const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
          if (nullptr != Entry)
            return Entry->Cost;
        }
        // For 8/16-bit elements, the cost is higher because the type
        // requires promotion and possibly splitting:
        if (LT.second.getScalarType() == MVT::i8)
          Cost *= 8;
        else if (LT.second.getScalarType() == MVT::i16)
          Cost *= 4;
        return Cost;
      } else {
        // If one of the operands is a uniform constant then the cost for each
        // element is the cost of insertion, extraction and division.
        // Insertion cost = 2, extraction cost = 2, division = cost of the
        // operation on the scalar type.
        if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
            (Op2Info.isConstant() && Op2Info.isUniform())) {
          if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
            InstructionCost DivCost = BaseT::getArithmeticInstrCost(
                Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
            return (4 + DivCost) * VTy->getNumElements();
          }
        }
        // On AArch64, without SVE, vector divisions are expanded
        // into scalar divisions of each pair of elements.
        Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
                                       CostKind, Op1Info, Op2Info);
        Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                       Op1Info, Op2Info);
      }

      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;
  }
  case ISD::MUL:
    // When SVE is available, we can lower the v2i64 operation using
    // the SVE mul instruction, which has a lower cost.
    if (LT.second == MVT::v2i64 && ST->hasSVE())
      return LT.first;

    // When SVE is not available, there is no MUL.2d instruction,
    // which means mul <2 x i64> is expensive as elements are extracted
    // from the vectors and the muls scalarized.
    // As getScalarizationOverhead is a bit too pessimistic, we
    // estimate the cost for an i64 vector directly here, which is:
    // - four 2-cost i64 extracts,
    // - two 2-cost i64 inserts, and
    // - two 1-cost muls.
    // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
    // LT.first = 2 the cost is 28. If both operands are extensions it will not
    // need to scalarize so the cost can be cheaper (smull or umull).
    if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
      return LT.first;
    return LT.first * 14;
  case ISD::ADD:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
2382 return LT.first; 2383 2384 case ISD::FADD: 2385 case ISD::FSUB: 2386 case ISD::FMUL: 2387 case ISD::FDIV: 2388 case ISD::FNEG: 2389 // These nodes are marked as 'custom' just to lower them to SVE. 2390 // We know said lowering will incur no additional cost. 2391 if (!Ty->getScalarType()->isFP128Ty()) 2392 return 2 * LT.first; 2393 2394 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 2395 Op2Info); 2396 } 2397 } 2398 2399 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 2400 ScalarEvolution *SE, 2401 const SCEV *Ptr) { 2402 // Address computations in vectorized code with non-consecutive addresses will 2403 // likely result in more instructions compared to scalar code where the 2404 // computation can more often be merged into the index mode. The resulting 2405 // extra micro-ops can significantly decrease throughput. 2406 unsigned NumVectorInstToHideOverhead = 10; 2407 int MaxMergeDistance = 64; 2408 2409 if (Ty->isVectorTy() && SE && 2410 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 2411 return NumVectorInstToHideOverhead; 2412 2413 // In many cases the address computation is not merged into the instruction 2414 // addressing mode. 2415 return 1; 2416 } 2417 2418 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 2419 Type *CondTy, 2420 CmpInst::Predicate VecPred, 2421 TTI::TargetCostKind CostKind, 2422 const Instruction *I) { 2423 // TODO: Handle other cost kinds. 2424 if (CostKind != TTI::TCK_RecipThroughput) 2425 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2426 I); 2427 2428 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2429 // We don't lower some vector selects well that are wider than the register 2430 // width. 2431 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 2432 // We would need this many instructions to hide the scalarization happening. 2433 const int AmortizationCost = 20; 2434 2435 // If VecPred is not set, check if we can get a predicate from the context 2436 // instruction, if its type matches the requested ValTy. 2437 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 2438 CmpInst::Predicate CurrentPred; 2439 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 2440 m_Value()))) 2441 VecPred = CurrentPred; 2442 } 2443 // Check if we have a compare/select chain that can be lowered using 2444 // a (F)CMxx & BFI pair. 
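    // For example (illustrative only), select(icmp sgt a, b), a, b on v4i32
    // can be emitted as cmgt + bif, which is why the legal types below are
    // costed at just LT.first.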
    if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
        VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
        VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
        VecPred == CmpInst::FCMP_UNE) {
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      auto LT = getTypeLegalizationCost(ValTy);
      if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
          (ST->hasFullFP16() &&
           any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
        return LT.first;
    }

    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  if (ST->requiresStrictAlign()) {
    // TODO: Add cost modeling for strict align. Misaligned loads expand to
    // a bunch of instructions when strict align is enabled.
    return Options;
  }
  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  // TODO: Though vector loads usually perform well on AArch64, in some targets
  // they may wake up the FP unit, which raises the power consumption. Perhaps
  // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}

bool AArch64TTIImpl::prefersVectorizedAddressing() const {
  return ST->hasSVE();
}

InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                      Align Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind) {
  if (useNeonVector(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
  auto LT = getTypeLegalizationCost(Src);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
2522 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2523 return InstructionCost::getInvalid(); 2524 2525 return LT.first; 2526 } 2527 2528 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2529 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2530 } 2531 2532 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2533 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2534 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2535 if (useNeonVector(DataTy)) 2536 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2537 Alignment, CostKind, I); 2538 auto *VT = cast<VectorType>(DataTy); 2539 auto LT = getTypeLegalizationCost(DataTy); 2540 if (!LT.first.isValid()) 2541 return InstructionCost::getInvalid(); 2542 2543 // The code-generator is currently not able to handle scalable vectors 2544 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2545 // it. This change will be removed when code-generation for these types is 2546 // sufficiently reliable. 2547 if (cast<VectorType>(DataTy)->getElementCount() == 2548 ElementCount::getScalable(1)) 2549 return InstructionCost::getInvalid(); 2550 2551 ElementCount LegalVF = LT.second.getVectorElementCount(); 2552 InstructionCost MemOpCost = 2553 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, 2554 {TTI::OK_AnyValue, TTI::OP_None}, I); 2555 // Add on an overhead cost for using gathers/scatters. 2556 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2557 // point we may want a per-CPU overhead. 2558 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2559 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2560 } 2561 2562 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2563 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2564 } 2565 2566 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2567 MaybeAlign Alignment, 2568 unsigned AddressSpace, 2569 TTI::TargetCostKind CostKind, 2570 TTI::OperandValueInfo OpInfo, 2571 const Instruction *I) { 2572 EVT VT = TLI->getValueType(DL, Ty, true); 2573 // Type legalization can't handle structs 2574 if (VT == MVT::Other) 2575 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2576 CostKind); 2577 2578 auto LT = getTypeLegalizationCost(Ty); 2579 if (!LT.first.isValid()) 2580 return InstructionCost::getInvalid(); 2581 2582 // The code-generator is currently not able to handle scalable vectors 2583 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2584 // it. This change will be removed when code-generation for these types is 2585 // sufficiently reliable. 2586 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2587 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2588 return InstructionCost::getInvalid(); 2589 2590 // TODO: consider latency as well for TCK_SizeAndLatency. 2591 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2592 return LT.first; 2593 2594 if (CostKind != TTI::TCK_RecipThroughput) 2595 return 1; 2596 2597 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2598 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2599 // Unaligned stores are extremely inefficient. We don't split all 2600 // unaligned 128-bit stores because the negative impact that has shown in 2601 // practice on inlined block copy code. 
2602 // We make such stores expensive so that we will only vectorize if there 2603 // are 6 other instructions getting vectorized. 2604 const int AmortizationCost = 6; 2605 2606 return LT.first * 2 * AmortizationCost; 2607 } 2608 2609 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. 2610 if (Ty->isPtrOrPtrVectorTy()) 2611 return LT.first; 2612 2613 // Check truncating stores and extending loads. 2614 if (useNeonVector(Ty) && 2615 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2616 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2617 if (VT == MVT::v4i8) 2618 return 2; 2619 // Otherwise we need to scalarize. 2620 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2621 } 2622 2623 return LT.first; 2624 } 2625 2626 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2627 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2628 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2629 bool UseMaskForCond, bool UseMaskForGaps) { 2630 assert(Factor >= 2 && "Invalid interleave factor"); 2631 auto *VecVTy = cast<FixedVectorType>(VecTy); 2632 2633 if (!UseMaskForCond && !UseMaskForGaps && 2634 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2635 unsigned NumElts = VecVTy->getNumElements(); 2636 auto *SubVecTy = 2637 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2638 2639 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2640 // Accesses having vector types that are a multiple of 128 bits can be 2641 // matched to more than one ldN/stN instruction. 2642 bool UseScalable; 2643 if (NumElts % Factor == 0 && 2644 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2645 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2646 } 2647 2648 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2649 Alignment, AddressSpace, CostKind, 2650 UseMaskForCond, UseMaskForGaps); 2651 } 2652 2653 InstructionCost 2654 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2655 InstructionCost Cost = 0; 2656 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2657 for (auto *I : Tys) { 2658 if (!I->isVectorTy()) 2659 continue; 2660 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2661 128) 2662 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2663 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2664 } 2665 return Cost; 2666 } 2667 2668 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2669 return ST->getMaxInterleaveFactor(); 2670 } 2671 2672 // For Falkor, we want to avoid having too many strided loads in a loop since 2673 // that can exhaust the HW prefetcher resources. We adjust the unroller 2674 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2675 // many strided loads. 2676 static void 2677 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2678 TargetTransformInfo::UnrollingPreferences &UP) { 2679 enum { MaxStridedLoads = 7 }; 2680 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2681 int StridedLoads = 0; 2682 // FIXME? We could make this more precise by looking at the CFG and 2683 // e.g. not counting loads in each side of an if-then-else diamond. 
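    // Worked example (assuming MaxStridedLoads == 7 as above): the walk below
    // stops once more than 7/2 == 3 strided loads are seen, because with 4 or
    // more loads 1 << Log2_32(7 / StridedLoads) is already clamped to a
    // MaxCount of 1, so counting further cannot change the result.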
2684 for (const auto BB : L->blocks()) { 2685 for (auto &I : *BB) { 2686 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2687 if (!LMemI) 2688 continue; 2689 2690 Value *PtrValue = LMemI->getPointerOperand(); 2691 if (L->isLoopInvariant(PtrValue)) 2692 continue; 2693 2694 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2695 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2696 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2697 continue; 2698 2699 // FIXME? We could take pairing of unrolled load copies into account 2700 // by looking at the AddRec, but we would probably have to limit this 2701 // to loops with no stores or other memory optimization barriers. 2702 ++StridedLoads; 2703 // We've seen enough strided loads that seeing more won't make a 2704 // difference. 2705 if (StridedLoads > MaxStridedLoads / 2) 2706 return StridedLoads; 2707 } 2708 } 2709 return StridedLoads; 2710 }; 2711 2712 int StridedLoads = countStridedLoads(L, SE); 2713 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2714 << " strided loads\n"); 2715 // Pick the largest power of 2 unroll count that won't result in too many 2716 // strided loads. 2717 if (StridedLoads) { 2718 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2719 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2720 << UP.MaxCount << '\n'); 2721 } 2722 } 2723 2724 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2725 TTI::UnrollingPreferences &UP, 2726 OptimizationRemarkEmitter *ORE) { 2727 // Enable partial unrolling and runtime unrolling. 2728 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2729 2730 UP.UpperBound = true; 2731 2732 // For inner loop, it is more likely to be a hot one, and the runtime check 2733 // can be promoted out from LICM pass, so the overhead is less, let's try 2734 // a larger threshold to unroll more loops. 2735 if (L->getLoopDepth() > 1) 2736 UP.PartialThreshold *= 2; 2737 2738 // Disable partial & runtime unrolling on -Os. 2739 UP.PartialOptSizeThreshold = 0; 2740 2741 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2742 EnableFalkorHWPFUnrollFix) 2743 getFalkorUnrollingPreferences(L, SE, UP); 2744 2745 // Scan the loop: don't unroll loops with calls as this could prevent 2746 // inlining. Don't unroll vector loops either, as they don't benefit much from 2747 // unrolling. 2748 for (auto *BB : L->getBlocks()) { 2749 for (auto &I : *BB) { 2750 // Don't unroll vectorised loop. 
2751 if (I.getType()->isVectorTy()) 2752 return; 2753 2754 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2755 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2756 if (!isLoweredToCall(F)) 2757 continue; 2758 } 2759 return; 2760 } 2761 } 2762 } 2763 2764 // Enable runtime unrolling for in-order models 2765 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2766 // checking for that case, we can ensure that the default behaviour is 2767 // unchanged 2768 if (ST->getProcFamily() != AArch64Subtarget::Others && 2769 !ST->getSchedModel().isOutOfOrder()) { 2770 UP.Runtime = true; 2771 UP.Partial = true; 2772 UP.UnrollRemainder = true; 2773 UP.DefaultUnrollRuntimeCount = 4; 2774 2775 UP.UnrollAndJam = true; 2776 UP.UnrollAndJamInnerLoopThreshold = 60; 2777 } 2778 } 2779 2780 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2781 TTI::PeelingPreferences &PP) { 2782 BaseT::getPeelingPreferences(L, SE, PP); 2783 } 2784 2785 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2786 Type *ExpectedType) { 2787 switch (Inst->getIntrinsicID()) { 2788 default: 2789 return nullptr; 2790 case Intrinsic::aarch64_neon_st2: 2791 case Intrinsic::aarch64_neon_st3: 2792 case Intrinsic::aarch64_neon_st4: { 2793 // Create a struct type 2794 StructType *ST = dyn_cast<StructType>(ExpectedType); 2795 if (!ST) 2796 return nullptr; 2797 unsigned NumElts = Inst->arg_size() - 1; 2798 if (ST->getNumElements() != NumElts) 2799 return nullptr; 2800 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2801 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2802 return nullptr; 2803 } 2804 Value *Res = PoisonValue::get(ExpectedType); 2805 IRBuilder<> Builder(Inst); 2806 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2807 Value *L = Inst->getArgOperand(i); 2808 Res = Builder.CreateInsertValue(Res, L, i); 2809 } 2810 return Res; 2811 } 2812 case Intrinsic::aarch64_neon_ld2: 2813 case Intrinsic::aarch64_neon_ld3: 2814 case Intrinsic::aarch64_neon_ld4: 2815 if (Inst->getType() == ExpectedType) 2816 return Inst; 2817 return nullptr; 2818 } 2819 } 2820 2821 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2822 MemIntrinsicInfo &Info) { 2823 switch (Inst->getIntrinsicID()) { 2824 default: 2825 break; 2826 case Intrinsic::aarch64_neon_ld2: 2827 case Intrinsic::aarch64_neon_ld3: 2828 case Intrinsic::aarch64_neon_ld4: 2829 Info.ReadMem = true; 2830 Info.WriteMem = false; 2831 Info.PtrVal = Inst->getArgOperand(0); 2832 break; 2833 case Intrinsic::aarch64_neon_st2: 2834 case Intrinsic::aarch64_neon_st3: 2835 case Intrinsic::aarch64_neon_st4: 2836 Info.ReadMem = false; 2837 Info.WriteMem = true; 2838 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2839 break; 2840 } 2841 2842 switch (Inst->getIntrinsicID()) { 2843 default: 2844 return false; 2845 case Intrinsic::aarch64_neon_ld2: 2846 case Intrinsic::aarch64_neon_st2: 2847 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2848 break; 2849 case Intrinsic::aarch64_neon_ld3: 2850 case Intrinsic::aarch64_neon_st3: 2851 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2852 break; 2853 case Intrinsic::aarch64_neon_ld4: 2854 case Intrinsic::aarch64_neon_st4: 2855 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2856 break; 2857 } 2858 return true; 2859 } 2860 2861 /// See if \p I should be considered for address type promotion. We check if \p 2862 /// I is a sext with right type and used in memory accesses. 
If it used in a 2863 /// "complex" getelementptr, we allow it to be promoted without finding other 2864 /// sext instructions that sign extended the same initial value. A getelementptr 2865 /// is considered as "complex" if it has more than 2 operands. 2866 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2867 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2868 bool Considerable = false; 2869 AllowPromotionWithoutCommonHeader = false; 2870 if (!isa<SExtInst>(&I)) 2871 return false; 2872 Type *ConsideredSExtType = 2873 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2874 if (I.getType() != ConsideredSExtType) 2875 return false; 2876 // See if the sext is the one with the right type and used in at least one 2877 // GetElementPtrInst. 2878 for (const User *U : I.users()) { 2879 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2880 Considerable = true; 2881 // A getelementptr is considered as "complex" if it has more than 2 2882 // operands. We will promote a SExt used in such complex GEP as we 2883 // expect some computation to be merged if they are done on 64 bits. 2884 if (GEPInst->getNumOperands() > 2) { 2885 AllowPromotionWithoutCommonHeader = true; 2886 break; 2887 } 2888 } 2889 } 2890 return Considerable; 2891 } 2892 2893 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2894 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2895 if (!VF.isScalable()) 2896 return true; 2897 2898 Type *Ty = RdxDesc.getRecurrenceType(); 2899 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2900 return false; 2901 2902 switch (RdxDesc.getRecurrenceKind()) { 2903 case RecurKind::Add: 2904 case RecurKind::FAdd: 2905 case RecurKind::And: 2906 case RecurKind::Or: 2907 case RecurKind::Xor: 2908 case RecurKind::SMin: 2909 case RecurKind::SMax: 2910 case RecurKind::UMin: 2911 case RecurKind::UMax: 2912 case RecurKind::FMin: 2913 case RecurKind::FMax: 2914 case RecurKind::SelectICmp: 2915 case RecurKind::SelectFCmp: 2916 case RecurKind::FMulAdd: 2917 return true; 2918 default: 2919 return false; 2920 } 2921 } 2922 2923 InstructionCost 2924 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2925 bool IsUnsigned, 2926 TTI::TargetCostKind CostKind) { 2927 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 2928 2929 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2930 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2931 2932 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2933 "Both vector needs to be equally scalable"); 2934 2935 InstructionCost LegalizationCost = 0; 2936 if (LT.first > 1) { 2937 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2938 unsigned MinMaxOpcode = 2939 Ty->isFPOrFPVectorTy() 2940 ? Intrinsic::maxnum 2941 : (IsUnsigned ? 
Intrinsic::umin : Intrinsic::smin); 2942 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2943 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2944 } 2945 2946 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2947 } 2948 2949 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2950 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2951 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 2952 InstructionCost LegalizationCost = 0; 2953 if (LT.first > 1) { 2954 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2955 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2956 LegalizationCost *= LT.first - 1; 2957 } 2958 2959 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2960 assert(ISD && "Invalid opcode"); 2961 // Add the final reduction cost for the legal horizontal reduction 2962 switch (ISD) { 2963 case ISD::ADD: 2964 case ISD::AND: 2965 case ISD::OR: 2966 case ISD::XOR: 2967 case ISD::FADD: 2968 return LegalizationCost + 2; 2969 default: 2970 return InstructionCost::getInvalid(); 2971 } 2972 } 2973 2974 InstructionCost 2975 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2976 std::optional<FastMathFlags> FMF, 2977 TTI::TargetCostKind CostKind) { 2978 if (TTI::requiresOrderedReduction(FMF)) { 2979 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2980 InstructionCost BaseCost = 2981 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2982 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2983 // end up vectorizing for more computationally intensive loops. 2984 return BaseCost + FixedVTy->getNumElements(); 2985 } 2986 2987 if (Opcode != Instruction::FAdd) 2988 return InstructionCost::getInvalid(); 2989 2990 auto *VTy = cast<ScalableVectorType>(ValTy); 2991 InstructionCost Cost = 2992 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2993 Cost *= getMaxNumElements(VTy->getElementCount()); 2994 return Cost; 2995 } 2996 2997 if (isa<ScalableVectorType>(ValTy)) 2998 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2999 3000 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 3001 MVT MTy = LT.second; 3002 int ISD = TLI->InstructionOpcodeToISD(Opcode); 3003 assert(ISD && "Invalid opcode"); 3004 3005 // Horizontal adds can use the 'addv' instruction. We model the cost of these 3006 // instructions as twice a normal vector add, plus 1 for each legalization 3007 // step (LT.first). This is the only arithmetic vector reduction operation for 3008 // which we have an instruction. 
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8, 2},
      {ISD::ADD, MVT::v16i8, 2},
      {ISD::ADD, MVT::v4i16, 2},
      {ISD::ADD, MVT::v8i16, 2},
      {ISD::ADD, MVT::v4i32, 2},
      {ISD::ADD, MVT::v2i64, 2},
      {ISD::OR, MVT::v8i8, 15},
      {ISD::OR, MVT::v16i8, 17},
      {ISD::OR, MVT::v4i16, 7},
      {ISD::OR, MVT::v8i16, 9},
      {ISD::OR, MVT::v2i32, 3},
      {ISD::OR, MVT::v4i32, 5},
      {ISD::OR, MVT::v2i64, 3},
      {ISD::XOR, MVT::v8i8, 15},
      {ISD::XOR, MVT::v16i8, 17},
      {ISD::XOR, MVT::v4i16, 7},
      {ISD::XOR, MVT::v8i16, 9},
      {ISD::XOR, MVT::v2i32, 3},
      {ISD::XOR, MVT::v4i32, 5},
      {ISD::XOR, MVT::v2i64, 3},
      {ISD::AND, MVT::v8i8, 15},
      {ISD::AND, MVT::v16i8, 17},
      {ISD::AND, MVT::v4i16, 7},
      {ISD::AND, MVT::v8i16, 9},
      {ISD::AND, MVT::v2i32, 3},
      {ISD::AND, MVT::v4i32, 5},
      {ISD::AND, MVT::v2i64, 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::ADD:
    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(ValTy);
    if (!ValVTy->getElementType()->isIntegerTy(1) &&
        MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                        MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      return Entry->Cost + ExtraCost;
    }
    break;
  }
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  static const CostTblEntry ShuffleTbl[] = {
      { TTI::SK_Splice, MVT::nxv16i8, 1 },
      { TTI::SK_Splice, MVT::nxv8i16, 1 },
      { TTI::SK_Splice, MVT::nxv4i32, 1 },
      { TTI::SK_Splice, MVT::nxv2i64, 1 },
      { TTI::SK_Splice, MVT::nxv2f16, 1 },
      { TTI::SK_Splice, MVT::nxv4f16, 1 },
      { TTI::SK_Splice, MVT::nxv8f16, 1 },
      { TTI::SK_Splice, MVT::nxv2bf16, 1 },
      { TTI::SK_Splice, MVT::nxv4bf16, 1 },
      { TTI::SK_Splice, MVT::nxv8bf16, 1 },
      { TTI::SK_Splice, MVT::nxv2f32, 1 },
      { TTI::SK_Splice, MVT::nxv4f32, 1 },
      { TTI::SK_Splice, MVT::nxv2f64, 1 },
  };

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
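  // For example, a splice of <vscale x 1 x i64> is rejected here, whereas a
  // <vscale x 2 x i64> splice falls through and is costed via the table above.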
  if (Tp->getElementCount() == ElementCount::getScalable(1))
    return InstructionCost::getInvalid();

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  InstructionCost LegalizationCost = 0;
  if (Index < 0) {
    LegalizationCost =
        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicated splices are promoted during lowering; see
  // AArch64ISelLowering.cpp. The cost is computed on the promoted type.
  if (LT.second.getScalarType() == MVT::i1) {
    LegalizationCost +=
        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                         TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                         TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
}

InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask,
                                               TTI::TargetCostKind CostKind,
                                               int Index, VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  // If we have a Mask, and the LT is being legalized somehow, split the Mask
  // into smaller vectors and sum the cost of each shuffle.
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
      cast<FixedVectorType>(Tp)->getNumElements() >
          LT.second.getVectorNumElements() &&
      !Index && !SubTp) {
    unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
    assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
    unsigned LTNumElts = LT.second.getVectorNumElements();
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    VectorType *NTp =
        VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
    InstructionCost Cost;
    for (unsigned N = 0; N < NumVecs; N++) {
      SmallVector<int> NMask;
      // Split the existing mask into chunks of size LTNumElts. Track the
      // source sub-vectors to ensure the result has at most 2 inputs.
      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                      : UndefMaskElem;
        if (MaskElt < 0) {
          NMask.push_back(UndefMaskElem);
          continue;
        }

        // Calculate which source from the input this comes from and whether it
        // is new to us.
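        // For example (illustrative), with LTNumElts == 4 a mask element of 6
        // reads from source sub-vector 1 (6 / 4), lane 2 (6 % 4).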
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
          Source1 = Source;
          NumSources = 1;
        } else if (NumSources == 1 && Source != Source1) {
          Source2 = Source;
          NumSources = 2;
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
          NumSources++;
        }

        // Add to the new mask. For the NumSources>2 case these are not correct,
        // but are only used for the modular lane number.
        if (Source == Source1)
          NMask.push_back(MaskElt % LTNumElts);
        else if (Source == Source2)
          NMask.push_back(MaskElt % LTNumElts + LTNumElts);
        else
          NMask.push_back(MaskElt % LTNumElts);
      }
      // If the sub-mask has at most 2 input sub-vectors then re-cost it using
      // getShuffleCost. If not then cost it using the worst case.
      if (NumSources <= 2)
        Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                               : TTI::SK_PermuteTwoSrc,
                               NTp, NMask, CostKind, 0, nullptr, Args);
      else if (any_of(enumerate(NMask), [&](const auto &ME) {
                 return ME.value() % LTNumElts == ME.index();
               }))
        Cost += LTNumElts - 1;
      else
        Cost += LTNumElts;
    }
    return Cost;
  }

  Kind = improveShuffleKindFromMask(Kind, Mask);

  // Check for broadcast loads.
  if (Kind == TTI::SK_Broadcast) {
    bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
    if (IsLoad && LT.second.isVector() &&
        isLegalBroadcastLoad(Tp->getElementType(),
                             LT.second.getVectorElementCount()))
      return 0; // broadcast is handled by ld1r
  }

  // If we have 4 elements for the shuffle and a Mask, get the cost straight
  // from the perfect shuffle tables.
  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
      all_of(Mask, [](int E) { return E < 8; }))
    return getPerfectShuffleCost(Mask);

  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        {TTI::SK_Broadcast, MVT::v8i8, 1},
        {TTI::SK_Broadcast, MVT::v16i8, 1},
        {TTI::SK_Broadcast, MVT::v4i16, 1},
        {TTI::SK_Broadcast, MVT::v8i16, 1},
        {TTI::SK_Broadcast, MVT::v2i32, 1},
        {TTI::SK_Broadcast, MVT::v4i32, 1},
        {TTI::SK_Broadcast, MVT::v2i64, 1},
        {TTI::SK_Broadcast, MVT::v2f32, 1},
        {TTI::SK_Broadcast, MVT::v4f32, 1},
        {TTI::SK_Broadcast, MVT::v2f64, 1},
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        {TTI::SK_Transpose, MVT::v8i8, 1},
        {TTI::SK_Transpose, MVT::v16i8, 1},
        {TTI::SK_Transpose, MVT::v4i16, 1},
        {TTI::SK_Transpose, MVT::v8i16, 1},
        {TTI::SK_Transpose, MVT::v2i32, 1},
        {TTI::SK_Transpose, MVT::v4i32, 1},
        {TTI::SK_Transpose, MVT::v2i64, 1},
        {TTI::SK_Transpose, MVT::v2f32, 1},
        {TTI::SK_Transpose, MVT::v4f32, 1},
        {TTI::SK_Transpose, MVT::v2f64, 1},
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        {TTI::SK_Select, MVT::v2i32, 1}, // mov.
        {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2i64, 1}, // mov.
        {TTI::SK_Select, MVT::v2f32, 1}, // mov.
        {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2f64, 1}, // mov.
        // PermuteSingleSrc shuffle kinds.
        {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
        {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
        // Splices can all be lowered as `ext`.
        {TTI::SK_Splice, MVT::v2i32, 1},
        {TTI::SK_Splice, MVT::v4i32, 1},
        {TTI::SK_Splice, MVT::v2i64, 1},
        {TTI::SK_Splice, MVT::v2f32, 1},
        {TTI::SK_Splice, MVT::v4f32, 1},
        {TTI::SK_Splice, MVT::v2f64, 1},
        {TTI::SK_Splice, MVT::v8f16, 1},
        {TTI::SK_Splice, MVT::v8bf16, 1},
        {TTI::SK_Splice, MVT::v8i16, 1},
        {TTI::SK_Splice, MVT::v16i8, 1},
        {TTI::SK_Splice, MVT::v4bf16, 1},
        {TTI::SK_Splice, MVT::v4f16, 1},
        {TTI::SK_Splice, MVT::v4i16, 1},
        {TTI::SK_Splice, MVT::v8i8, 1},
        // Broadcast shuffle kinds for scalable vectors.
        {TTI::SK_Broadcast, MVT::nxv16i8, 1},
        {TTI::SK_Broadcast, MVT::nxv8i16, 1},
        {TTI::SK_Broadcast, MVT::nxv4i32, 1},
        {TTI::SK_Broadcast, MVT::nxv2i64, 1},
        {TTI::SK_Broadcast, MVT::nxv2f16, 1},
        {TTI::SK_Broadcast, MVT::nxv4f16, 1},
        {TTI::SK_Broadcast, MVT::nxv8f16, 1},
        {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv2f32, 1},
        {TTI::SK_Broadcast, MVT::nxv4f32, 1},
        {TTI::SK_Broadcast, MVT::nxv2f64, 1},
        {TTI::SK_Broadcast, MVT::nxv16i1, 1},
        {TTI::SK_Broadcast, MVT::nxv8i1, 1},
        {TTI::SK_Broadcast, MVT::nxv4i1, 1},
        {TTI::SK_Broadcast, MVT::nxv2i1, 1},
        // Handle the cases for vector.reverse with scalable vectors.
        {TTI::SK_Reverse, MVT::nxv16i8, 1},
        {TTI::SK_Reverse, MVT::nxv8i16, 1},
        {TTI::SK_Reverse, MVT::nxv4i32, 1},
        {TTI::SK_Reverse, MVT::nxv2i64, 1},
        {TTI::SK_Reverse, MVT::nxv2f16, 1},
        {TTI::SK_Reverse, MVT::nxv4f16, 1},
        {TTI::SK_Reverse, MVT::nxv8f16, 1},
        {TTI::SK_Reverse, MVT::nxv2bf16, 1},
        {TTI::SK_Reverse, MVT::nxv4bf16, 1},
        {TTI::SK_Reverse, MVT::nxv8bf16, 1},
        {TTI::SK_Reverse, MVT::nxv2f32, 1},
        {TTI::SK_Reverse, MVT::nxv4f32, 1},
        {TTI::SK_Reverse, MVT::nxv2f64, 1},
        {TTI::SK_Reverse, MVT::nxv16i1, 1},
        {TTI::SK_Reverse, MVT::nxv8i1, 1},
        {TTI::SK_Reverse, MVT::nxv4i1, 1},
        {TTI::SK_Reverse, MVT::nxv2i1, 1},
    };
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

bool AArch64TTIImpl::preferPredicateOverEpilogue(
    Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
    InterleavedAccessInfo *IAI) {
  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
    return false;

  // We don't currently support vectorisation with interleaving for SVE; with
  // such loops we're better off not using tail-folding. This gives us a chance
  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
  if (IAI->hasGroups())
    return false;

  TailFoldingKind Required; // Defaults to 0.
  if (LVL->getReductionVars().size())
    Required.add(TailFoldingKind::TFReductions);
  if (LVL->getFixedOrderRecurrences().size())
    Required.add(TailFoldingKind::TFRecurrences);
  if (!Required)
    Required.add(TailFoldingKind::TFSimple);

  return (TailFoldingKindLoc & Required) == Required;
}

InstructionCost
AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     int64_t BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}
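// Worked example (illustrative): for an i32 access using the scaled form
// [Xn, Xm, lsl #2] the addressing mode above has Scale == 4, which is legal,
// so the scaling factor is costed as 1; an unscaled [Xn, Xm] access
// (Scale == 1) is costed as 0. Combinations that are not legal addressing
// modes return -1 to signal that they are unsupported.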