1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "llvm/Analysis/IVDescriptors.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/Analysis/TargetTransformInfo.h" 16 #include "llvm/CodeGen/BasicTTIImpl.h" 17 #include "llvm/CodeGen/CostTable.h" 18 #include "llvm/CodeGen/TargetLowering.h" 19 #include "llvm/IR/IntrinsicInst.h" 20 #include "llvm/IR/Intrinsics.h" 21 #include "llvm/IR/IntrinsicsAArch64.h" 22 #include "llvm/IR/PatternMatch.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Transforms/InstCombine/InstCombiner.h" 25 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 26 #include <algorithm> 27 using namespace llvm; 28 using namespace llvm::PatternMatch; 29 30 #define DEBUG_TYPE "aarch64tti" 31 32 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 33 cl::init(true), cl::Hidden); 34 35 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 36 cl::Hidden); 37 38 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 39 cl::init(10), cl::Hidden); 40 41 class TailFoldingKind { 42 private: 43 uint8_t Bits = 0; // Currently defaults to disabled. 44 45 public: 46 enum TailFoldingOpts { 47 TFDisabled = 0x0, 48 TFReductions = 0x01, 49 TFRecurrences = 0x02, 50 TFSimple = 0x80, 51 TFAll = TFReductions | TFRecurrences | TFSimple 52 }; 53 54 void operator=(const std::string &Val) { 55 if (Val.empty()) 56 return; 57 SmallVector<StringRef, 6> TailFoldTypes; 58 StringRef(Val).split(TailFoldTypes, '+', -1, false); 59 for (auto TailFoldType : TailFoldTypes) { 60 if (TailFoldType == "disabled") 61 Bits = 0; 62 else if (TailFoldType == "all") 63 Bits = TFAll; 64 else if (TailFoldType == "default") 65 Bits = 0; // Currently defaults to never tail-folding. 
66 else if (TailFoldType == "simple") 67 add(TFSimple); 68 else if (TailFoldType == "reductions") 69 add(TFReductions); 70 else if (TailFoldType == "recurrences") 71 add(TFRecurrences); 72 else if (TailFoldType == "noreductions") 73 remove(TFReductions); 74 else if (TailFoldType == "norecurrences") 75 remove(TFRecurrences); 76 else { 77 errs() 78 << "invalid argument " << TailFoldType.str() 79 << " to -sve-tail-folding=; each element must be one of: disabled, " 80 "all, default, simple, reductions, noreductions, recurrences, " 81 "norecurrences\n"; 82 } 83 } 84 } 85 86 operator uint8_t() const { return Bits; } 87 88 void add(uint8_t Flag) { Bits |= Flag; } 89 void remove(uint8_t Flag) { Bits &= ~Flag; } 90 }; 91 92 TailFoldingKind TailFoldingKindLoc; 93 94 cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding( 95 "sve-tail-folding", 96 cl::desc( 97 "Control the use of vectorisation using tail-folding for SVE:" 98 "\ndisabled No loop types will vectorize using tail-folding" 99 "\ndefault Uses the default tail-folding settings for the target " 100 "CPU" 101 "\nall All legal loop types will vectorize using tail-folding" 102 "\nsimple Use tail-folding for simple loops (not reductions or " 103 "recurrences)" 104 "\nreductions Use tail-folding for loops containing reductions" 105 "\nrecurrences Use tail-folding for loops containing first order " 106 "recurrences"), 107 cl::location(TailFoldingKindLoc)); 108 109 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, 110 const Function *Callee) const { 111 const TargetMachine &TM = getTLI()->getTargetMachine(); 112 113 const FeatureBitset &CallerBits = 114 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 115 const FeatureBitset &CalleeBits = 116 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 117 118 // Inline a callee if its target-features are a subset of the callers 119 // target-features. 120 return (CallerBits & CalleeBits) == CalleeBits; 121 } 122 123 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( 124 TargetTransformInfo::RegisterKind K) const { 125 assert(K != TargetTransformInfo::RGK_Scalar); 126 return K == TargetTransformInfo::RGK_FixedWidthVector; 127 } 128 129 /// Calculate the cost of materializing a 64-bit value. This helper 130 /// method might only calculate a fraction of a larger immediate. Therefore it 131 /// is valid to return a cost of ZERO. 132 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { 133 // Check if the immediate can be encoded within an instruction. 134 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) 135 return 0; 136 137 if (Val < 0) 138 Val = ~Val; 139 140 // Calculate how many moves we will need to materialize this constant. 141 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 142 AArch64_IMM::expandMOVImm(Val, 64, Insn); 143 return Insn.size(); 144 } 145 146 /// Calculate the cost of materializing the given constant. 147 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 148 TTI::TargetCostKind CostKind) { 149 assert(Ty->isIntegerTy()); 150 151 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 152 if (BitSize == 0) 153 return ~0U; 154 155 // Sign-extend all constants to a multiple of 64-bit. 156 APInt ImmVal = Imm; 157 if (BitSize & 0x3f) 158 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 159 160 // Split the constant into 64-bit chunks and calculate the cost for each 161 // chunk. 
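  // For example, a 128-bit constant is split into two 64-bit chunks, and each
  // chunk is priced by the 64-bit getIntImmCost helper above (illustrative
  // note).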
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 will be
      // converted to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64, 4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8, 1},
        {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ?
1 405 : 0; 406 return LT.first * Entry->Cost + ExtraCost; 407 } 408 break; 409 } 410 case Intrinsic::sadd_with_overflow: 411 case Intrinsic::uadd_with_overflow: 412 case Intrinsic::ssub_with_overflow: 413 case Intrinsic::usub_with_overflow: 414 case Intrinsic::smul_with_overflow: 415 case Intrinsic::umul_with_overflow: { 416 static const CostTblEntry WithOverflowCostTbl[] = { 417 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 418 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 419 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 420 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 421 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 422 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 423 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 424 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 425 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 426 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 427 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 428 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 429 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 430 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 431 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 432 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 433 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 434 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 435 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 436 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 437 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 438 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 439 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 440 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 441 }; 442 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 443 if (MTy.isSimple()) 444 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 445 MTy.getSimpleVT())) 446 return Entry->Cost; 447 break; 448 } 449 case Intrinsic::fptosi_sat: 450 case Intrinsic::fptoui_sat: { 451 if (ICA.getArgTypes().empty()) 452 break; 453 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 454 auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); 455 EVT MTy = TLI->getValueType(DL, RetTy); 456 // Check for the legal types, which are where the size of the input and the 457 // output are the same, or we are using cvt f64->i32 or f32->i64. 458 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 459 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 460 LT.second == MVT::v2f64) && 461 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 462 (LT.second == MVT::f64 && MTy == MVT::i32) || 463 (LT.second == MVT::f32 && MTy == MVT::i64))) 464 return LT.first; 465 // Similarly for fp16 sizes 466 if (ST->hasFullFP16() && 467 ((LT.second == MVT::f16 && MTy == MVT::i32) || 468 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 469 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) 470 return LT.first; 471 472 // Otherwise we use a legal convert followed by a min+max 473 if ((LT.second.getScalarType() == MVT::f32 || 474 LT.second.getScalarType() == MVT::f64 || 475 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && 476 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 477 Type *LegalTy = 478 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 479 if (LT.second.isVector()) 480 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 481 InstructionCost Cost = 1; 482 IntrinsicCostAttributes Attrs1(IsSigned ? 
Intrinsic::smin : Intrinsic::umin, 483 LegalTy, {LegalTy, LegalTy}); 484 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 485 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, 486 LegalTy, {LegalTy, LegalTy}); 487 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 488 return LT.first * Cost; 489 } 490 break; 491 } 492 default: 493 break; 494 } 495 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 496 } 497 498 /// The function will remove redundant reinterprets casting in the presence 499 /// of the control flow 500 static Optional<Instruction *> processPhiNode(InstCombiner &IC, 501 IntrinsicInst &II) { 502 SmallVector<Instruction *, 32> Worklist; 503 auto RequiredType = II.getType(); 504 505 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 506 assert(PN && "Expected Phi Node!"); 507 508 // Don't create a new Phi unless we can remove the old one. 509 if (!PN->hasOneUse()) 510 return None; 511 512 for (Value *IncValPhi : PN->incoming_values()) { 513 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 514 if (!Reinterpret || 515 Reinterpret->getIntrinsicID() != 516 Intrinsic::aarch64_sve_convert_to_svbool || 517 RequiredType != Reinterpret->getArgOperand(0)->getType()) 518 return None; 519 } 520 521 // Create the new Phi 522 LLVMContext &Ctx = PN->getContext(); 523 IRBuilder<> Builder(Ctx); 524 Builder.SetInsertPoint(PN); 525 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 526 Worklist.push_back(PN); 527 528 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 529 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 530 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 531 Worklist.push_back(Reinterpret); 532 } 533 534 // Cleanup Phi Node and reinterprets 535 return IC.replaceInstUsesWith(II, NPN); 536 } 537 538 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 539 // => (binop (pred) (from_svbool _) (from_svbool _)) 540 // 541 // The above transformation eliminates a `to_svbool` in the predicate 542 // operand of bitwise operation `binop` by narrowing the vector width of 543 // the operation. For example, it would convert a `<vscale x 16 x i1> 544 // and` into a `<vscale x 4 x i1> and`. This is profitable because 545 // to_svbool must zero the new lanes during widening, whereas 546 // from_svbool is free. 
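//
// For example (schematic only; intrinsic type suffixes omitted):
//   (aarch64.sve.convert.from.svbool
//       (aarch64.sve.and.z (aarch64.sve.convert.to.svbool %pg4) %a %b))
// becomes
//   (aarch64.sve.and.z %pg4 (aarch64.sve.convert.from.svbool %a)
//                           (aarch64.sve.convert.from.svbool %b))
// where %pg4 is a <vscale x 4 x i1> predicate and %a, %b are svbool_t values.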
547 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, 548 IntrinsicInst &II) { 549 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 550 if (!BinOp) 551 return None; 552 553 auto IntrinsicID = BinOp->getIntrinsicID(); 554 switch (IntrinsicID) { 555 case Intrinsic::aarch64_sve_and_z: 556 case Intrinsic::aarch64_sve_bic_z: 557 case Intrinsic::aarch64_sve_eor_z: 558 case Intrinsic::aarch64_sve_nand_z: 559 case Intrinsic::aarch64_sve_nor_z: 560 case Intrinsic::aarch64_sve_orn_z: 561 case Intrinsic::aarch64_sve_orr_z: 562 break; 563 default: 564 return None; 565 } 566 567 auto BinOpPred = BinOp->getOperand(0); 568 auto BinOpOp1 = BinOp->getOperand(1); 569 auto BinOpOp2 = BinOp->getOperand(2); 570 571 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 572 if (!PredIntr || 573 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 574 return None; 575 576 auto PredOp = PredIntr->getOperand(0); 577 auto PredOpTy = cast<VectorType>(PredOp->getType()); 578 if (PredOpTy != II.getType()) 579 return None; 580 581 IRBuilder<> Builder(II.getContext()); 582 Builder.SetInsertPoint(&II); 583 584 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 585 auto NarrowBinOpOp1 = Builder.CreateIntrinsic( 586 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 587 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 588 if (BinOpOp1 == BinOpOp2) 589 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 590 else 591 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( 592 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 593 594 auto NarrowedBinOp = 595 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 596 return IC.replaceInstUsesWith(II, NarrowedBinOp); 597 } 598 599 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, 600 IntrinsicInst &II) { 601 // If the reinterpret instruction operand is a PHI Node 602 if (isa<PHINode>(II.getArgOperand(0))) 603 return processPhiNode(IC, II); 604 605 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 606 return BinOpCombine; 607 608 SmallVector<Instruction *, 32> CandidatesForRemoval; 609 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 610 611 const auto *IVTy = cast<VectorType>(II.getType()); 612 613 // Walk the chain of conversions. 614 while (Cursor) { 615 // If the type of the cursor has fewer lanes than the final result, zeroing 616 // must take place, which breaks the equivalence chain. 617 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 618 if (CursorVTy->getElementCount().getKnownMinValue() < 619 IVTy->getElementCount().getKnownMinValue()) 620 break; 621 622 // If the cursor has the same type as I, it is a viable replacement. 623 if (Cursor->getType() == IVTy) 624 EarliestReplacement = Cursor; 625 626 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 627 628 // If this is not an SVE conversion intrinsic, this is the end of the chain. 629 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 630 Intrinsic::aarch64_sve_convert_to_svbool || 631 IntrinsicCursor->getIntrinsicID() == 632 Intrinsic::aarch64_sve_convert_from_svbool)) 633 break; 634 635 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 636 Cursor = IntrinsicCursor->getOperand(0); 637 } 638 639 // If no viable replacement in the conversion chain was found, there is 640 // nothing to do. 
641 if (!EarliestReplacement) 642 return None; 643 644 return IC.replaceInstUsesWith(II, EarliestReplacement); 645 } 646 647 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, 648 IntrinsicInst &II) { 649 IRBuilder<> Builder(&II); 650 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), 651 II.getOperand(2)); 652 return IC.replaceInstUsesWith(II, Select); 653 } 654 655 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 656 IntrinsicInst &II) { 657 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 658 if (!Pg) 659 return None; 660 661 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 662 return None; 663 664 const auto PTruePattern = 665 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 666 if (PTruePattern != AArch64SVEPredPattern::vl1) 667 return None; 668 669 // The intrinsic is inserting into lane zero so use an insert instead. 670 auto *IdxTy = Type::getInt64Ty(II.getContext()); 671 auto *Insert = InsertElementInst::Create( 672 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 673 Insert->insertBefore(&II); 674 Insert->takeName(&II); 675 676 return IC.replaceInstUsesWith(II, Insert); 677 } 678 679 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 680 IntrinsicInst &II) { 681 // Replace DupX with a regular IR splat. 682 IRBuilder<> Builder(II.getContext()); 683 Builder.SetInsertPoint(&II); 684 auto *RetTy = cast<ScalableVectorType>(II.getType()); 685 Value *Splat = 686 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); 687 Splat->takeName(&II); 688 return IC.replaceInstUsesWith(II, Splat); 689 } 690 691 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 692 IntrinsicInst &II) { 693 LLVMContext &Ctx = II.getContext(); 694 IRBuilder<> Builder(Ctx); 695 Builder.SetInsertPoint(&II); 696 697 // Check that the predicate is all active 698 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 699 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 700 return None; 701 702 const auto PTruePattern = 703 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 704 if (PTruePattern != AArch64SVEPredPattern::all) 705 return None; 706 707 // Check that we have a compare of zero.. 
708 auto *SplatValue = 709 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 710 if (!SplatValue || !SplatValue->isZero()) 711 return None; 712 713 // ..against a dupq 714 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 715 if (!DupQLane || 716 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 717 return None; 718 719 // Where the dupq is a lane 0 replicate of a vector insert 720 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) 721 return None; 722 723 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 724 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) 725 return None; 726 727 // Where the vector insert is a fixed constant vector insert into undef at 728 // index zero 729 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 730 return None; 731 732 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 733 return None; 734 735 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 736 if (!ConstVec) 737 return None; 738 739 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 740 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 741 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 742 return None; 743 744 unsigned NumElts = VecTy->getNumElements(); 745 unsigned PredicateBits = 0; 746 747 // Expand intrinsic operands to a 16-bit byte level predicate 748 for (unsigned I = 0; I < NumElts; ++I) { 749 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 750 if (!Arg) 751 return None; 752 if (!Arg->isZero()) 753 PredicateBits |= 1 << (I * (16 / NumElts)); 754 } 755 756 // If all bits are zero bail early with an empty predicate 757 if (PredicateBits == 0) { 758 auto *PFalse = Constant::getNullValue(II.getType()); 759 PFalse->takeName(&II); 760 return IC.replaceInstUsesWith(II, PFalse); 761 } 762 763 // Calculate largest predicate type used (where byte predicate is largest) 764 unsigned Mask = 8; 765 for (unsigned I = 0; I < 16; ++I) 766 if ((PredicateBits & (1 << I)) != 0) 767 Mask |= (I % 8); 768 769 unsigned PredSize = Mask & -Mask; 770 auto *PredType = ScalableVectorType::get( 771 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 772 773 // Ensure all relevant bits are set 774 for (unsigned I = 0; I < 16; I += PredSize) 775 if ((PredicateBits & (1 << I)) == 0) 776 return None; 777 778 auto *PTruePat = 779 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 780 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 781 {PredType}, {PTruePat}); 782 auto *ConvertToSVBool = Builder.CreateIntrinsic( 783 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 784 auto *ConvertFromSVBool = 785 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 786 {II.getType()}, {ConvertToSVBool}); 787 788 ConvertFromSVBool->takeName(&II); 789 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 790 } 791 792 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, 793 IntrinsicInst &II) { 794 IRBuilder<> Builder(II.getContext()); 795 Builder.SetInsertPoint(&II); 796 Value *Pg = II.getArgOperand(0); 797 Value *Vec = II.getArgOperand(1); 798 auto IntrinsicID = II.getIntrinsicID(); 799 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 800 801 // lastX(splat(X)) --> X 802 if (auto *SplatVal = getSplatValue(Vec)) 803 return IC.replaceInstUsesWith(II, SplatVal); 804 805 // If x and/or y is a splat value then: 806 // lastX (binop (x, y)) --> binop(lastX(x), 
lastX(y)) 807 Value *LHS, *RHS; 808 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 809 if (isSplatValue(LHS) || isSplatValue(RHS)) { 810 auto *OldBinOp = cast<BinaryOperator>(Vec); 811 auto OpC = OldBinOp->getOpcode(); 812 auto *NewLHS = 813 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 814 auto *NewRHS = 815 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 816 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 817 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); 818 return IC.replaceInstUsesWith(II, NewBinOp); 819 } 820 } 821 822 auto *C = dyn_cast<Constant>(Pg); 823 if (IsAfter && C && C->isNullValue()) { 824 // The intrinsic is extracting lane 0 so use an extract instead. 825 auto *IdxTy = Type::getInt64Ty(II.getContext()); 826 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 827 Extract->insertBefore(&II); 828 Extract->takeName(&II); 829 return IC.replaceInstUsesWith(II, Extract); 830 } 831 832 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 833 if (!IntrPG) 834 return None; 835 836 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 837 return None; 838 839 const auto PTruePattern = 840 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 841 842 // Can the intrinsic's predicate be converted to a known constant index? 843 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 844 if (!MinNumElts) 845 return None; 846 847 unsigned Idx = MinNumElts - 1; 848 // Increment the index if extracting the element after the last active 849 // predicate element. 850 if (IsAfter) 851 ++Idx; 852 853 // Ignore extracts whose index is larger than the known minimum vector 854 // length. NOTE: This is an artificial constraint where we prefer to 855 // maintain what the user asked for until an alternative is proven faster. 856 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 857 if (Idx >= PgVTy->getMinNumElements()) 858 return None; 859 860 // The intrinsic is extracting a fixed lane so use an extract instead. 861 auto *IdxTy = Type::getInt64Ty(II.getContext()); 862 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 863 Extract->insertBefore(&II); 864 Extract->takeName(&II); 865 return IC.replaceInstUsesWith(II, Extract); 866 } 867 868 static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, 869 IntrinsicInst &II) { 870 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar 871 // integer variant across a variety of micro-architectures. Replace scalar 872 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple 873 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more 874 // depending on the micro-architecture, but has been observed as generally 875 // being faster, particularly when the CLAST[AB] op is a loop-carried 876 // dependency. 
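  //
  // Schematically, for a 32-bit element type the rewrite below turns
  //   i32 clastb(pg, i32 fallback, <vscale x 4 x i32> vec)
  // into
  //   bitcast-to-i32(float clastb(pg, bitcast-to-float(fallback),
  //                               bitcast(<vscale x 4 x float>)(vec)))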
877 IRBuilder<> Builder(II.getContext()); 878 Builder.SetInsertPoint(&II); 879 Value *Pg = II.getArgOperand(0); 880 Value *Fallback = II.getArgOperand(1); 881 Value *Vec = II.getArgOperand(2); 882 Type *Ty = II.getType(); 883 884 if (!Ty->isIntegerTy()) 885 return None; 886 887 Type *FPTy; 888 switch (cast<IntegerType>(Ty)->getBitWidth()) { 889 default: 890 return None; 891 case 16: 892 FPTy = Builder.getHalfTy(); 893 break; 894 case 32: 895 FPTy = Builder.getFloatTy(); 896 break; 897 case 64: 898 FPTy = Builder.getDoubleTy(); 899 break; 900 } 901 902 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); 903 auto *FPVTy = VectorType::get( 904 FPTy, cast<VectorType>(Vec->getType())->getElementCount()); 905 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); 906 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, 907 {Pg, FPFallBack, FPVec}); 908 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); 909 return IC.replaceInstUsesWith(II, FPIItoInt); 910 } 911 912 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 913 IntrinsicInst &II) { 914 LLVMContext &Ctx = II.getContext(); 915 IRBuilder<> Builder(Ctx); 916 Builder.SetInsertPoint(&II); 917 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 918 // can work with RDFFR_PP for ptest elimination. 919 auto *AllPat = 920 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 921 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 922 {II.getType()}, {AllPat}); 923 auto *RDFFR = 924 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 925 RDFFR->takeName(&II); 926 return IC.replaceInstUsesWith(II, RDFFR); 927 } 928 929 static Optional<Instruction *> 930 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 931 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 932 933 if (Pattern == AArch64SVEPredPattern::all) { 934 LLVMContext &Ctx = II.getContext(); 935 IRBuilder<> Builder(Ctx); 936 Builder.SetInsertPoint(&II); 937 938 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 939 auto *VScale = Builder.CreateVScale(StepVal); 940 VScale->takeName(&II); 941 return IC.replaceInstUsesWith(II, VScale); 942 } 943 944 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 945 946 return MinNumElts && NumElts >= MinNumElts 947 ? 
Optional<Instruction *>(IC.replaceInstUsesWith( 948 II, ConstantInt::get(II.getType(), MinNumElts))) 949 : None; 950 } 951 952 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 953 IntrinsicInst &II) { 954 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 955 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 956 957 if (Op1 && Op2 && 958 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 959 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && 960 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { 961 962 IRBuilder<> Builder(II.getContext()); 963 Builder.SetInsertPoint(&II); 964 965 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; 966 Type *Tys[] = {Op1->getArgOperand(0)->getType()}; 967 968 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); 969 970 PTest->takeName(&II); 971 return IC.replaceInstUsesWith(II, PTest); 972 } 973 974 return None; 975 } 976 977 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, 978 IntrinsicInst &II) { 979 // fold (fadd p a (fmul p b c)) -> (fma p a b c) 980 Value *P = II.getOperand(0); 981 Value *A = II.getOperand(1); 982 auto FMul = II.getOperand(2); 983 Value *B, *C; 984 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>( 985 m_Specific(P), m_Value(B), m_Value(C)))) 986 return None; 987 988 if (!FMul->hasOneUse()) 989 return None; 990 991 llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); 992 // Stop the combine when the flags on the inputs differ in case dropping flags 993 // would lead to us missing out on more beneficial optimizations. 994 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags()) 995 return None; 996 if (!FAddFlags.allowContract()) 997 return None; 998 999 IRBuilder<> Builder(II.getContext()); 1000 Builder.SetInsertPoint(&II); 1001 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla, 1002 {II.getType()}, {P, A, B, C}, &II); 1003 FMLA->setFastMathFlags(FAddFlags); 1004 return IC.replaceInstUsesWith(II, FMLA); 1005 } 1006 1007 static bool isAllActivePredicate(Value *Pred) { 1008 // Look through convert.from.svbool(convert.to.svbool(...) chain. 1009 Value *UncastedPred; 1010 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 1011 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 1012 m_Value(UncastedPred))))) 1013 // If the predicate has the same or less lanes than the uncasted 1014 // predicate then we know the casting has no effect. 
1015 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 1016 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 1017 Pred = UncastedPred; 1018 1019 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1020 m_ConstantInt<AArch64SVEPredPattern::all>())); 1021 } 1022 1023 static Optional<Instruction *> 1024 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1025 IRBuilder<> Builder(II.getContext()); 1026 Builder.SetInsertPoint(&II); 1027 1028 Value *Pred = II.getOperand(0); 1029 Value *PtrOp = II.getOperand(1); 1030 Type *VecTy = II.getType(); 1031 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); 1032 1033 if (isAllActivePredicate(Pred)) { 1034 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); 1035 Load->copyMetadata(II); 1036 return IC.replaceInstUsesWith(II, Load); 1037 } 1038 1039 CallInst *MaskedLoad = 1040 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), 1041 Pred, ConstantAggregateZero::get(VecTy)); 1042 MaskedLoad->copyMetadata(II); 1043 return IC.replaceInstUsesWith(II, MaskedLoad); 1044 } 1045 1046 static Optional<Instruction *> 1047 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1048 IRBuilder<> Builder(II.getContext()); 1049 Builder.SetInsertPoint(&II); 1050 1051 Value *VecOp = II.getOperand(0); 1052 Value *Pred = II.getOperand(1); 1053 Value *PtrOp = II.getOperand(2); 1054 Value *VecPtr = 1055 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); 1056 1057 if (isAllActivePredicate(Pred)) { 1058 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); 1059 Store->copyMetadata(II); 1060 return IC.eraseInstFromFunction(II); 1061 } 1062 1063 CallInst *MaskedStore = Builder.CreateMaskedStore( 1064 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); 1065 MaskedStore->copyMetadata(II); 1066 return IC.eraseInstFromFunction(II); 1067 } 1068 1069 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 1070 switch (Intrinsic) { 1071 case Intrinsic::aarch64_sve_fmul: 1072 return Instruction::BinaryOps::FMul; 1073 case Intrinsic::aarch64_sve_fadd: 1074 return Instruction::BinaryOps::FAdd; 1075 case Intrinsic::aarch64_sve_fsub: 1076 return Instruction::BinaryOps::FSub; 1077 default: 1078 return Instruction::BinaryOpsEnd; 1079 } 1080 } 1081 1082 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC, 1083 IntrinsicInst &II) { 1084 auto *OpPredicate = II.getOperand(0); 1085 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 1086 if (BinOpCode == Instruction::BinaryOpsEnd || 1087 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1088 m_ConstantInt<AArch64SVEPredPattern::all>()))) 1089 return None; 1090 IRBuilder<> Builder(II.getContext()); 1091 Builder.SetInsertPoint(&II); 1092 Builder.setFastMathFlags(II.getFastMathFlags()); 1093 auto BinOp = 1094 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); 1095 return IC.replaceInstUsesWith(II, BinOp); 1096 } 1097 1098 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC, 1099 IntrinsicInst &II) { 1100 if (auto FMLA = instCombineSVEVectorFMLA(IC, II)) 1101 return FMLA; 1102 return instCombineSVEVectorBinOp(IC, II); 1103 } 1104 1105 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 1106 IntrinsicInst &II) { 1107 auto *OpPredicate = II.getOperand(0); 1108 auto *OpMultiplicand = II.getOperand(1); 1109 auto *OpMultiplier = II.getOperand(2); 1110 1111 IRBuilder<> 
Builder(II.getContext()); 1112 Builder.SetInsertPoint(&II); 1113 1114 // Return true if a given instruction is a unit splat value, false otherwise. 1115 auto IsUnitSplat = [](auto *I) { 1116 auto *SplatValue = getSplatValue(I); 1117 if (!SplatValue) 1118 return false; 1119 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1120 }; 1121 1122 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1123 // with a unit splat value, false otherwise. 1124 auto IsUnitDup = [](auto *I) { 1125 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1126 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1127 return false; 1128 1129 auto *SplatValue = IntrI->getOperand(2); 1130 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1131 }; 1132 1133 if (IsUnitSplat(OpMultiplier)) { 1134 // [f]mul pg %n, (dupx 1) => %n 1135 OpMultiplicand->takeName(&II); 1136 return IC.replaceInstUsesWith(II, OpMultiplicand); 1137 } else if (IsUnitDup(OpMultiplier)) { 1138 // [f]mul pg %n, (dup pg 1) => %n 1139 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1140 auto *DupPg = DupInst->getOperand(1); 1141 // TODO: this is naive. The optimization is still valid if DupPg 1142 // 'encompasses' OpPredicate, not only if they're the same predicate. 1143 if (OpPredicate == DupPg) { 1144 OpMultiplicand->takeName(&II); 1145 return IC.replaceInstUsesWith(II, OpMultiplicand); 1146 } 1147 } 1148 1149 return instCombineSVEVectorBinOp(IC, II); 1150 } 1151 1152 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1153 IntrinsicInst &II) { 1154 IRBuilder<> Builder(II.getContext()); 1155 Builder.SetInsertPoint(&II); 1156 Value *UnpackArg = II.getArgOperand(0); 1157 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1158 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1159 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1160 1161 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1162 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1163 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1164 ScalarArg = 1165 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1166 Value *NewVal = 1167 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1168 NewVal->takeName(&II); 1169 return IC.replaceInstUsesWith(II, NewVal); 1170 } 1171 1172 return None; 1173 } 1174 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1175 IntrinsicInst &II) { 1176 auto *OpVal = II.getOperand(0); 1177 auto *OpIndices = II.getOperand(1); 1178 VectorType *VTy = cast<VectorType>(II.getType()); 1179 1180 // Check whether OpIndices is a constant splat value < minimal element count 1181 // of result. 1182 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1183 if (!SplatValue || 1184 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1185 return None; 1186 1187 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1188 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1189 IRBuilder<> Builder(II.getContext()); 1190 Builder.SetInsertPoint(&II); 1191 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); 1192 auto *VectorSplat = 1193 Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1194 1195 VectorSplat->takeName(&II); 1196 return IC.replaceInstUsesWith(II, VectorSplat); 1197 } 1198 1199 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC, 1200 IntrinsicInst &II) { 1201 // Try to remove sequences of tuple get/set. 
1202 Value *SetTuple, *SetIndex, *SetValue; 1203 auto *GetTuple = II.getArgOperand(0); 1204 auto *GetIndex = II.getArgOperand(1); 1205 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a 1206 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue). 1207 // Make sure that the types of the current intrinsic and SetValue match 1208 // in order to safely remove the sequence. 1209 if (!match(GetTuple, 1210 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>( 1211 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) || 1212 SetValue->getType() != II.getType()) 1213 return None; 1214 // Case where we get the same index right after setting it. 1215 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue 1216 if (GetIndex == SetIndex) 1217 return IC.replaceInstUsesWith(II, SetValue); 1218 // If we are getting a different index than what was set in the tuple_set 1219 // intrinsic. We can just set the input tuple to the one up in the chain. 1220 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) 1221 // --> tuple_get(SetTuple, GetIndex) 1222 return IC.replaceOperand(II, 0, SetTuple); 1223 } 1224 1225 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1226 IntrinsicInst &II) { 1227 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1228 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1229 Value *A, *B; 1230 if (match(II.getArgOperand(0), 1231 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1232 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1233 m_Specific(A), m_Specific(B)))) 1234 return IC.replaceInstUsesWith( 1235 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1236 1237 return None; 1238 } 1239 1240 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, 1241 IntrinsicInst &II) { 1242 Value *Mask = II.getOperand(0); 1243 Value *BasePtr = II.getOperand(1); 1244 Value *Index = II.getOperand(2); 1245 Type *Ty = II.getType(); 1246 Value *PassThru = ConstantAggregateZero::get(Ty); 1247 1248 // Contiguous gather => masked load. 1249 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1250 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1251 Value *IndexBase; 1252 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1253 m_Value(IndexBase), m_SpecificInt(1)))) { 1254 IRBuilder<> Builder(II.getContext()); 1255 Builder.SetInsertPoint(&II); 1256 1257 Align Alignment = 1258 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1259 1260 Type *VecPtrTy = PointerType::getUnqual(Ty); 1261 Value *Ptr = Builder.CreateGEP( 1262 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1263 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1264 CallInst *MaskedLoad = 1265 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1266 MaskedLoad->takeName(&II); 1267 return IC.replaceInstUsesWith(II, MaskedLoad); 1268 } 1269 1270 return None; 1271 } 1272 1273 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, 1274 IntrinsicInst &II) { 1275 Value *Val = II.getOperand(0); 1276 Value *Mask = II.getOperand(1); 1277 Value *BasePtr = II.getOperand(2); 1278 Value *Index = II.getOperand(3); 1279 Type *Ty = Val->getType(); 1280 1281 // Contiguous scatter => masked store. 
1282 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1283 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1284 Value *IndexBase; 1285 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1286 m_Value(IndexBase), m_SpecificInt(1)))) { 1287 IRBuilder<> Builder(II.getContext()); 1288 Builder.SetInsertPoint(&II); 1289 1290 Align Alignment = 1291 BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); 1292 1293 Value *Ptr = Builder.CreateGEP( 1294 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); 1295 Type *VecPtrTy = PointerType::getUnqual(Ty); 1296 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); 1297 1298 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1299 1300 return IC.eraseInstFromFunction(II); 1301 } 1302 1303 return None; 1304 } 1305 1306 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1307 IntrinsicInst &II) { 1308 IRBuilder<> Builder(II.getContext()); 1309 Builder.SetInsertPoint(&II); 1310 Type *Int32Ty = Builder.getInt32Ty(); 1311 Value *Pred = II.getOperand(0); 1312 Value *Vec = II.getOperand(1); 1313 Value *DivVec = II.getOperand(2); 1314 1315 Value *SplatValue = getSplatValue(DivVec); 1316 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 1317 if (!SplatConstantInt) 1318 return None; 1319 APInt Divisor = SplatConstantInt->getValue(); 1320 1321 if (Divisor.isPowerOf2()) { 1322 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1323 auto ASRD = Builder.CreateIntrinsic( 1324 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1325 return IC.replaceInstUsesWith(II, ASRD); 1326 } 1327 if (Divisor.isNegatedPowerOf2()) { 1328 Divisor.negate(); 1329 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 1330 auto ASRD = Builder.CreateIntrinsic( 1331 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 1332 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, 1333 {ASRD->getType()}, {ASRD, Pred, ASRD}); 1334 return IC.replaceInstUsesWith(II, NEG); 1335 } 1336 1337 return None; 1338 } 1339 1340 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 1341 IntrinsicInst &II) { 1342 Value *A = II.getArgOperand(0); 1343 Value *B = II.getArgOperand(1); 1344 if (A == B) 1345 return IC.replaceInstUsesWith(II, A); 1346 1347 return None; 1348 } 1349 1350 static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 1351 IntrinsicInst &II) { 1352 IRBuilder<> Builder(&II); 1353 Value *Pred = II.getOperand(0); 1354 Value *Vec = II.getOperand(1); 1355 Value *Shift = II.getOperand(2); 1356 1357 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 1358 Value *AbsPred, *MergedValue; 1359 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 1360 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 1361 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 1362 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 1363 1364 return None; 1365 1366 // Transform is valid if any of the following are true: 1367 // * The ABS merge value is an undef or non-negative 1368 // * The ABS predicate is all active 1369 // * The ABS predicate and the SRSHL predicates are the same 1370 if (!isa<UndefValue>(MergedValue) && 1371 !match(MergedValue, m_NonNegative()) && 1372 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 1373 return None; 1374 1375 // Only valid when the shift amount is non-negative, otherwise the rounding 1376 // behaviour of SRSHL cannot be ignored. 
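  // For example (schematic): srshl(pg, abs(pg, x), splat(2)) becomes
  // lsl(pg, abs(pg, x), splat(2)), since a non-negative shift of a
  // non-negative value involves no rounding.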
1377 if (!match(Shift, m_NonNegative())) 1378 return None; 1379 1380 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, 1381 {Pred, Vec, Shift}); 1382 1383 return IC.replaceInstUsesWith(II, LSL); 1384 } 1385 1386 Optional<Instruction *> 1387 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 1388 IntrinsicInst &II) const { 1389 Intrinsic::ID IID = II.getIntrinsicID(); 1390 switch (IID) { 1391 default: 1392 break; 1393 case Intrinsic::aarch64_neon_fmaxnm: 1394 case Intrinsic::aarch64_neon_fminnm: 1395 return instCombineMaxMinNM(IC, II); 1396 case Intrinsic::aarch64_sve_convert_from_svbool: 1397 return instCombineConvertFromSVBool(IC, II); 1398 case Intrinsic::aarch64_sve_dup: 1399 return instCombineSVEDup(IC, II); 1400 case Intrinsic::aarch64_sve_dup_x: 1401 return instCombineSVEDupX(IC, II); 1402 case Intrinsic::aarch64_sve_cmpne: 1403 case Intrinsic::aarch64_sve_cmpne_wide: 1404 return instCombineSVECmpNE(IC, II); 1405 case Intrinsic::aarch64_sve_rdffr: 1406 return instCombineRDFFR(IC, II); 1407 case Intrinsic::aarch64_sve_lasta: 1408 case Intrinsic::aarch64_sve_lastb: 1409 return instCombineSVELast(IC, II); 1410 case Intrinsic::aarch64_sve_clasta_n: 1411 case Intrinsic::aarch64_sve_clastb_n: 1412 return instCombineSVECondLast(IC, II); 1413 case Intrinsic::aarch64_sve_cntd: 1414 return instCombineSVECntElts(IC, II, 2); 1415 case Intrinsic::aarch64_sve_cntw: 1416 return instCombineSVECntElts(IC, II, 4); 1417 case Intrinsic::aarch64_sve_cnth: 1418 return instCombineSVECntElts(IC, II, 8); 1419 case Intrinsic::aarch64_sve_cntb: 1420 return instCombineSVECntElts(IC, II, 16); 1421 case Intrinsic::aarch64_sve_ptest_any: 1422 case Intrinsic::aarch64_sve_ptest_first: 1423 case Intrinsic::aarch64_sve_ptest_last: 1424 return instCombineSVEPTest(IC, II); 1425 case Intrinsic::aarch64_sve_mul: 1426 case Intrinsic::aarch64_sve_fmul: 1427 return instCombineSVEVectorMul(IC, II); 1428 case Intrinsic::aarch64_sve_fadd: 1429 return instCombineSVEVectorFAdd(IC, II); 1430 case Intrinsic::aarch64_sve_fsub: 1431 return instCombineSVEVectorBinOp(IC, II); 1432 case Intrinsic::aarch64_sve_tbl: 1433 return instCombineSVETBL(IC, II); 1434 case Intrinsic::aarch64_sve_uunpkhi: 1435 case Intrinsic::aarch64_sve_uunpklo: 1436 case Intrinsic::aarch64_sve_sunpkhi: 1437 case Intrinsic::aarch64_sve_sunpklo: 1438 return instCombineSVEUnpack(IC, II); 1439 case Intrinsic::aarch64_sve_tuple_get: 1440 return instCombineSVETupleGet(IC, II); 1441 case Intrinsic::aarch64_sve_zip1: 1442 case Intrinsic::aarch64_sve_zip2: 1443 return instCombineSVEZip(IC, II); 1444 case Intrinsic::aarch64_sve_ld1_gather_index: 1445 return instCombineLD1GatherIndex(IC, II); 1446 case Intrinsic::aarch64_sve_st1_scatter_index: 1447 return instCombineST1ScatterIndex(IC, II); 1448 case Intrinsic::aarch64_sve_ld1: 1449 return instCombineSVELD1(IC, II, DL); 1450 case Intrinsic::aarch64_sve_st1: 1451 return instCombineSVEST1(IC, II, DL); 1452 case Intrinsic::aarch64_sve_sdiv: 1453 return instCombineSVESDIV(IC, II); 1454 case Intrinsic::aarch64_sve_sel: 1455 return instCombineSVESel(IC, II); 1456 case Intrinsic::aarch64_sve_srshl: 1457 return instCombineSVESrshl(IC, II); 1458 } 1459 1460 return None; 1461 } 1462 1463 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1464 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 1465 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 1466 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1467 SimplifyAndSetOp) const { 1468 switch 
(II.getIntrinsicID()) { 1469 default: 1470 break; 1471 case Intrinsic::aarch64_neon_fcvtxn: 1472 case Intrinsic::aarch64_neon_rshrn: 1473 case Intrinsic::aarch64_neon_sqrshrn: 1474 case Intrinsic::aarch64_neon_sqrshrun: 1475 case Intrinsic::aarch64_neon_sqshrn: 1476 case Intrinsic::aarch64_neon_sqshrun: 1477 case Intrinsic::aarch64_neon_sqxtn: 1478 case Intrinsic::aarch64_neon_sqxtun: 1479 case Intrinsic::aarch64_neon_uqrshrn: 1480 case Intrinsic::aarch64_neon_uqshrn: 1481 case Intrinsic::aarch64_neon_uqxtn: 1482 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 1483 break; 1484 } 1485 1486 return None; 1487 } 1488 1489 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 1490 ArrayRef<const Value *> Args) { 1491 1492 // A helper that returns a vector type from the given type. The number of 1493 // elements in type Ty determines the vector width. 1494 auto toVectorTy = [&](Type *ArgTy) { 1495 return VectorType::get(ArgTy->getScalarType(), 1496 cast<VectorType>(DstTy)->getElementCount()); 1497 }; 1498 1499 // Exit early if DstTy is not a vector type whose elements are at least 1500 // 16-bits wide. 1501 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) 1502 return false; 1503 1504 // Determine if the operation has a widening variant. We consider both the 1505 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 1506 // instructions. 1507 // 1508 // TODO: Add additional widening operations (e.g., shl, etc.) once we 1509 // verify that their extending operands are eliminated during code 1510 // generation. 1511 switch (Opcode) { 1512 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). 1513 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). 1514 case Instruction::Mul: // SMULL(2), UMULL(2) 1515 break; 1516 default: 1517 return false; 1518 } 1519 1520 // To be a widening instruction (either the "wide" or "long" versions), the 1521 // second operand must be a sign- or zero extend. 1522 if (Args.size() != 2 || 1523 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) 1524 return false; 1525 auto *Extend = cast<CastInst>(Args[1]); 1526 auto *Arg0 = dyn_cast<CastInst>(Args[0]); 1527 1528 // A mul only has a mull version (not like addw). Both operands need to be 1529 // extending and the same type. 1530 if (Opcode == Instruction::Mul && 1531 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || 1532 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) 1533 return false; 1534 1535 // Legalize the destination type and ensure it can be used in a widening 1536 // operation. 1537 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); 1538 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); 1539 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) 1540 return false; 1541 1542 // Legalize the source type and ensure it can be used in a widening 1543 // operation. 1544 auto *SrcTy = toVectorTy(Extend->getSrcTy()); 1545 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); 1546 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 1547 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 1548 return false; 1549 1550 // Get the total number of vector elements in the legalized types. 
1551 InstructionCost NumDstEls = 1552 DstTyL.first * DstTyL.second.getVectorMinNumElements(); 1553 InstructionCost NumSrcEls = 1554 SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 1555 1556 // Return true if the legalized types have the same number of vector elements 1557 // and the destination element type size is twice that of the source type. 1558 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; 1559 } 1560 1561 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1562 Type *Src, 1563 TTI::CastContextHint CCH, 1564 TTI::TargetCostKind CostKind, 1565 const Instruction *I) { 1566 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1567 assert(ISD && "Invalid opcode"); 1568 1569 // If the cast is observable, and it is used by a widening instruction (e.g., 1570 // uaddl, saddw, etc.), it may be free. 1571 if (I && I->hasOneUser()) { 1572 auto *SingleUser = cast<Instruction>(*I->user_begin()); 1573 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 1574 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { 1575 // If the cast is the second operand, it is free. We will generate either 1576 // a "wide" or "long" version of the widening instruction. 1577 if (I == SingleUser->getOperand(1)) 1578 return 0; 1579 // If the cast is not the second operand, it will be free if it looks the 1580 // same as the second operand. In this case, we will generate a "long" 1581 // version of the widening instruction. 1582 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) 1583 if (I->getOpcode() == unsigned(Cast->getOpcode()) && 1584 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) 1585 return 0; 1586 } 1587 } 1588 1589 // TODO: Allow non-throughput costs that aren't binary. 1590 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1591 if (CostKind != TTI::TCK_RecipThroughput) 1592 return Cost == 0 ? 0 : 1; 1593 return Cost; 1594 }; 1595 1596 EVT SrcTy = TLI->getValueType(DL, Src); 1597 EVT DstTy = TLI->getValueType(DL, Dst); 1598 1599 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1600 return AdjustCost( 1601 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1602 1603 static const TypeConversionCostTblEntry 1604 ConversionTbl[] = { 1605 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1606 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 1607 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1608 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 1609 1610 // Truncations on nxvmiN 1611 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 1612 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 1613 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 1614 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 1615 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 1616 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 1617 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 1618 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 1619 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 1620 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 1621 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 1622 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 1623 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 1624 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 1625 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 1626 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 1627 1628 // The number of shll instructions for the extension. 
1629 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1630 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1631 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1632 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1633 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1634 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1635 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1636 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1637 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1638 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 1639 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1640 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 1641 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1642 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1643 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1644 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 1645 1646 // LowerVectorINT_TO_FP: 1647 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1648 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1649 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1650 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 1651 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1652 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1653 1654 // Complex: to v2f32 1655 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1656 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1657 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1658 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 1659 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 1660 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 1661 1662 // Complex: to v4f32 1663 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 1664 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1665 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1666 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1667 1668 // Complex: to v8f32 1669 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1670 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1671 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 1672 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 1673 1674 // Complex: to v16f32 1675 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1676 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 1677 1678 // Complex: to v2f64 1679 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1680 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1681 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1682 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 1683 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 1684 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 1685 1686 1687 // LowerVectorFP_TO_INT 1688 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 1689 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 1690 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1691 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1692 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1693 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1694 1695 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 
1696 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 1697 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 1698 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 1699 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 1700 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 1701 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 1702 1703 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 1704 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 1705 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 1706 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 1707 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 1708 1709 // Complex, from nxv2f32. 1710 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1711 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1712 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1713 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1714 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 1715 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 1716 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 1717 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 1718 1719 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 1720 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 1721 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 1722 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 1723 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 1724 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 1725 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 1726 1727 // Complex, from nxv2f64. 1728 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1729 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1730 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1731 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1732 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 1733 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 1734 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 1735 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 1736 1737 // Complex, from nxv4f32. 1738 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1739 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1740 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1741 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1742 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 1743 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 1744 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 1745 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 1746 1747 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 1748 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1749 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1750 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 1751 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 1752 1753 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 1754 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1755 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1756 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1757 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 1758 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 1759 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 1760 1761 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 1762 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1763 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1764 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 1765 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 1766 1767 // Complex, from nxv8f16. 
1768 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1769 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1770 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1771 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1772 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 1773 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 1774 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 1775 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 1776 1777 // Complex, from nxv4f16. 1778 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1779 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1780 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1781 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1782 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 1783 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 1784 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 1785 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 1786 1787 // Complex, from nxv2f16. 1788 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1789 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1790 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1791 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1792 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 1793 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 1794 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 1795 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 1796 1797 // Truncate from nxvmf32 to nxvmf16. 1798 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 1799 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 1800 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 1801 1802 // Truncate from nxvmf64 to nxvmf16. 1803 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, 1804 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, 1805 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, 1806 1807 // Truncate from nxvmf64 to nxvmf32. 1808 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, 1809 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, 1810 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, 1811 1812 // Extend from nxvmf16 to nxvmf32. 1813 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 1814 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 1815 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 1816 1817 // Extend from nxvmf16 to nxvmf64. 1818 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 1819 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 1820 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 1821 1822 // Extend from nxvmf32 to nxvmf64. 
1823 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 1824 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 1825 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 1826 1827 // Bitcasts from float to integer 1828 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, 1829 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, 1830 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, 1831 1832 // Bitcasts from integer to float 1833 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, 1834 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, 1835 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, 1836 }; 1837 1838 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, 1839 DstTy.getSimpleVT(), 1840 SrcTy.getSimpleVT())) 1841 return AdjustCost(Entry->Cost); 1842 1843 static const TypeConversionCostTblEntry FP16Tbl[] = { 1844 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 1845 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 1846 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 1847 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 1848 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 1849 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 1850 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 1851 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 1852 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 1853 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 1854 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 1855 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 1856 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 1857 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 1858 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 1859 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 1860 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 1861 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 1862 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 1863 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 1864 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 1865 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 1866 }; 1867 1868 if (ST->hasFullFP16()) 1869 if (const auto *Entry = ConvertCostTableLookup( 1870 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 1871 return AdjustCost(Entry->Cost); 1872 1873 return AdjustCost( 1874 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 1875 } 1876 1877 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 1878 Type *Dst, 1879 VectorType *VecTy, 1880 unsigned Index) { 1881 1882 // Make sure we were given a valid extend opcode. 1883 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 1884 "Invalid opcode"); 1885 1886 // We are extending an element we extract from a vector, so the source type 1887 // of the extend is the element type of the vector. 1888 auto *Src = VecTy->getElementType(); 1889 1890 // Sign- and zero-extends are for integer types only. 1891 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 1892 1893 // Get the cost for the extract. We compute the cost (if any) for the extend 1894 // below. 1895 InstructionCost Cost = 1896 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); 1897 1898 // Legalize the types. 
1899 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); 1900 auto DstVT = TLI->getValueType(DL, Dst); 1901 auto SrcVT = TLI->getValueType(DL, Src); 1902 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1903 1904 // If the resulting type is still a vector and the destination type is legal, 1905 // we may get the extension for free. If not, get the default cost for the 1906 // extend. 1907 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 1908 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1909 CostKind); 1910 1911 // The destination type should be larger than the element type. If not, get 1912 // the default cost for the extend. 1913 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 1914 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1915 CostKind); 1916 1917 switch (Opcode) { 1918 default: 1919 llvm_unreachable("Opcode should be either SExt or ZExt"); 1920 1921 // For sign-extends, we only need a smov, which performs the extension 1922 // automatically. 1923 case Instruction::SExt: 1924 return Cost; 1925 1926 // For zero-extends, the extend is performed automatically by a umov unless 1927 // the destination type is i64 and the element type is i8 or i16. 1928 case Instruction::ZExt: 1929 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 1930 return Cost; 1931 } 1932 1933 // If we are unable to perform the extend for free, get the default cost. 1934 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 1935 CostKind); 1936 } 1937 1938 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 1939 TTI::TargetCostKind CostKind, 1940 const Instruction *I) { 1941 if (CostKind != TTI::TCK_RecipThroughput) 1942 return Opcode == Instruction::PHI ? 0 : 1; 1943 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 1944 // Branches are assumed to be predicted. 1945 return 0; 1946 } 1947 1948 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1949 unsigned Index) { 1950 assert(Val->isVectorTy() && "This must be a vector type"); 1951 1952 if (Index != -1U) { 1953 // Legalize the type. 1954 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1955 1956 // This type is legalized to a scalar type. 1957 if (!LT.second.isVector()) 1958 return 0; 1959 1960 // The type may be split. For fixed-width vectors we can normalize the 1961 // index to the new type. 1962 if (LT.second.isFixedLengthVector()) { 1963 unsigned Width = LT.second.getVectorNumElements(); 1964 Index = Index % Width; 1965 } 1966 1967 // The element at index zero is already inside the vector. 1968 if (Index == 0) 1969 return 0; 1970 } 1971 1972 // All other insert/extracts cost this much. 1973 return ST->getVectorInsertExtractBaseCost(); 1974 } 1975 1976 InstructionCost AArch64TTIImpl::getArithmeticInstrCost( 1977 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1978 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, 1979 TTI::OperandValueProperties Opd1PropInfo, 1980 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1981 const Instruction *CxtI) { 1982 // TODO: Handle more cost kinds. 1983 if (CostKind != TTI::TCK_RecipThroughput) 1984 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 1985 Opd2Info, Opd1PropInfo, 1986 Opd2PropInfo, Args, CxtI); 1987 1988 // Legalize the type. 
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
      InstructionCost Cost = getArithmeticInstrCost(
          Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
                                     Opd2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(
          Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
                                     Opd2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV: {
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the
        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
        // to MULHS + SUB + SRL + ADD + SRL.
        InstructionCost MulCost = getArithmeticInstrCost(
            Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost AddCost = getArithmeticInstrCost(
            Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost ShrCost = getArithmeticInstrCost(
            Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // On AArch64, vector divisions are not supported natively and are
      // expanded into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;
  }
  case ISD::MUL:
    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
    // as elements are extracted from the vectors and the muls scalarized.
2058 // As getScalarizationOverhead is a bit too pessimistic, we estimate the 2059 // cost for a i64 vector directly here, which is: 2060 // - four 2-cost i64 extracts, 2061 // - two 2-cost i64 inserts, and 2062 // - two 1-cost muls. 2063 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with 2064 // LT.first = 2 the cost is 28. If both operands are extensions it will not 2065 // need to scalarize so the cost can be cheaper (smull or umull). 2066 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) 2067 return LT.first; 2068 return LT.first * 14; 2069 case ISD::ADD: 2070 case ISD::XOR: 2071 case ISD::OR: 2072 case ISD::AND: 2073 case ISD::SRL: 2074 case ISD::SRA: 2075 case ISD::SHL: 2076 // These nodes are marked as 'custom' for combining purposes only. 2077 // We know that they are legal. See LowerAdd in ISelLowering. 2078 return LT.first; 2079 2080 case ISD::FADD: 2081 case ISD::FSUB: 2082 case ISD::FMUL: 2083 case ISD::FDIV: 2084 case ISD::FNEG: 2085 // These nodes are marked as 'custom' just to lower them to SVE. 2086 // We know said lowering will incur no additional cost. 2087 if (!Ty->getScalarType()->isFP128Ty()) 2088 return 2 * LT.first; 2089 2090 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, 2091 Opd2Info, Opd1PropInfo, Opd2PropInfo); 2092 } 2093 } 2094 2095 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 2096 ScalarEvolution *SE, 2097 const SCEV *Ptr) { 2098 // Address computations in vectorized code with non-consecutive addresses will 2099 // likely result in more instructions compared to scalar code where the 2100 // computation can more often be merged into the index mode. The resulting 2101 // extra micro-ops can significantly decrease throughput. 2102 unsigned NumVectorInstToHideOverhead = 10; 2103 int MaxMergeDistance = 64; 2104 2105 if (Ty->isVectorTy() && SE && 2106 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 2107 return NumVectorInstToHideOverhead; 2108 2109 // In many cases the address computation is not merged into the instruction 2110 // addressing mode. 2111 return 1; 2112 } 2113 2114 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 2115 Type *CondTy, 2116 CmpInst::Predicate VecPred, 2117 TTI::TargetCostKind CostKind, 2118 const Instruction *I) { 2119 // TODO: Handle other cost kinds. 2120 if (CostKind != TTI::TCK_RecipThroughput) 2121 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2122 I); 2123 2124 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2125 // We don't lower some vector selects well that are wider than the register 2126 // width. 2127 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 2128 // We would need this many instructions to hide the scalarization happening. 2129 const int AmortizationCost = 20; 2130 2131 // If VecPred is not set, check if we can get a predicate from the context 2132 // instruction, if its type matches the requested ValTy. 2133 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 2134 CmpInst::Predicate CurrentPred; 2135 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 2136 m_Value()))) 2137 VecPred = CurrentPred; 2138 } 2139 // Check if we have a compare/select chain that can be lowered using 2140 // a (F)CMxx & BFI pair. 
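    // For example, a select idiom such as:
    //   %c = icmp slt <4 x i32> %a, %b
    //   %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
    // is typically matched to a single SMIN, and the other predicates listed
    // below map to a CMxx/FCMxx feeding a BIF/BSL, so for the legal types in
    // ValidMinMaxTys the whole idiom is costed at its legalization count
    // (LT.first) rather than falling through to the generic select costs.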
    if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
        VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
        VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
        VecPred == CmpInst::FCMP_UNE) {
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
      if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
          (ST->hasFullFP16() &&
           any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
        return LT.first;
    }

    static const TypeConversionCostTblEntry
        VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  if (ST->requiresStrictAlign()) {
    // TODO: Add cost modeling for strict align. Misaligned loads expand to
    // a bunch of instructions when strict align is enabled.
    return Options;
  }
  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  // TODO: Though vector loads usually perform well on AArch64, in some targets
  // they may wake up the FP unit, which raises the power consumption. Perhaps
  // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}

bool AArch64TTIImpl::prefersVectorizedAddressing() const {
  return ST->hasSVE();
}

InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                      Align Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind) {
  if (useNeonVector(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
  auto LT = TLI->getTypeLegalizationCost(DL, Src);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
2218 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) 2219 return InstructionCost::getInvalid(); 2220 2221 return LT.first * 2; 2222 } 2223 2224 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 2225 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 2226 } 2227 2228 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 2229 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 2230 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 2231 if (useNeonVector(DataTy)) 2232 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 2233 Alignment, CostKind, I); 2234 auto *VT = cast<VectorType>(DataTy); 2235 auto LT = TLI->getTypeLegalizationCost(DL, DataTy); 2236 if (!LT.first.isValid()) 2237 return InstructionCost::getInvalid(); 2238 2239 // The code-generator is currently not able to handle scalable vectors 2240 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2241 // it. This change will be removed when code-generation for these types is 2242 // sufficiently reliable. 2243 if (cast<VectorType>(DataTy)->getElementCount() == 2244 ElementCount::getScalable(1)) 2245 return InstructionCost::getInvalid(); 2246 2247 ElementCount LegalVF = LT.second.getVectorElementCount(); 2248 InstructionCost MemOpCost = 2249 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); 2250 // Add on an overhead cost for using gathers/scatters. 2251 // TODO: At the moment this is applied unilaterally for all CPUs, but at some 2252 // point we may want a per-CPU overhead. 2253 MemOpCost *= getSVEGatherScatterOverhead(Opcode); 2254 return LT.first * MemOpCost * getMaxNumElements(LegalVF); 2255 } 2256 2257 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 2258 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 2259 } 2260 2261 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 2262 MaybeAlign Alignment, 2263 unsigned AddressSpace, 2264 TTI::TargetCostKind CostKind, 2265 const Instruction *I) { 2266 EVT VT = TLI->getValueType(DL, Ty, true); 2267 // Type legalization can't handle structs 2268 if (VT == MVT::Other) 2269 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 2270 CostKind); 2271 2272 auto LT = TLI->getTypeLegalizationCost(DL, Ty); 2273 if (!LT.first.isValid()) 2274 return InstructionCost::getInvalid(); 2275 2276 // The code-generator is currently not able to handle scalable vectors 2277 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 2278 // it. This change will be removed when code-generation for these types is 2279 // sufficiently reliable. 2280 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 2281 if (VTy->getElementCount() == ElementCount::getScalable(1)) 2282 return InstructionCost::getInvalid(); 2283 2284 // TODO: consider latency as well for TCK_SizeAndLatency. 2285 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) 2286 return LT.first; 2287 2288 if (CostKind != TTI::TCK_RecipThroughput) 2289 return 1; 2290 2291 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && 2292 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { 2293 // Unaligned stores are extremely inefficient. We don't split all 2294 // unaligned 128-bit stores because the negative impact that has shown in 2295 // practice on inlined block copy code. 
2296 // We make such stores expensive so that we will only vectorize if there 2297 // are 6 other instructions getting vectorized. 2298 const int AmortizationCost = 6; 2299 2300 return LT.first * 2 * AmortizationCost; 2301 } 2302 2303 // Check truncating stores and extending loads. 2304 if (useNeonVector(Ty) && 2305 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { 2306 // v4i8 types are lowered to scalar a load/store and sshll/xtn. 2307 if (VT == MVT::v4i8) 2308 return 2; 2309 // Otherwise we need to scalarize. 2310 return cast<FixedVectorType>(Ty)->getNumElements() * 2; 2311 } 2312 2313 return LT.first; 2314 } 2315 2316 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 2317 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 2318 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 2319 bool UseMaskForCond, bool UseMaskForGaps) { 2320 assert(Factor >= 2 && "Invalid interleave factor"); 2321 auto *VecVTy = cast<FixedVectorType>(VecTy); 2322 2323 if (!UseMaskForCond && !UseMaskForGaps && 2324 Factor <= TLI->getMaxSupportedInterleaveFactor()) { 2325 unsigned NumElts = VecVTy->getNumElements(); 2326 auto *SubVecTy = 2327 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 2328 2329 // ldN/stN only support legal vector types of size 64 or 128 in bits. 2330 // Accesses having vector types that are a multiple of 128 bits can be 2331 // matched to more than one ldN/stN instruction. 2332 bool UseScalable; 2333 if (NumElts % Factor == 0 && 2334 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 2335 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 2336 } 2337 2338 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2339 Alignment, AddressSpace, CostKind, 2340 UseMaskForCond, UseMaskForGaps); 2341 } 2342 2343 InstructionCost 2344 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 2345 InstructionCost Cost = 0; 2346 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2347 for (auto *I : Tys) { 2348 if (!I->isVectorTy()) 2349 continue; 2350 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 2351 128) 2352 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 2353 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 2354 } 2355 return Cost; 2356 } 2357 2358 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { 2359 return ST->getMaxInterleaveFactor(); 2360 } 2361 2362 // For Falkor, we want to avoid having too many strided loads in a loop since 2363 // that can exhaust the HW prefetcher resources. We adjust the unroller 2364 // MaxCount preference below to attempt to ensure unrolling doesn't create too 2365 // many strided loads. 2366 static void 2367 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2368 TargetTransformInfo::UnrollingPreferences &UP) { 2369 enum { MaxStridedLoads = 7 }; 2370 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 2371 int StridedLoads = 0; 2372 // FIXME? We could make this more precise by looking at the CFG and 2373 // e.g. not counting loads in each side of an if-then-else diamond. 
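    // For example, a loop such as
    //   for (i = 0; i < n; ++i)
    //     sum += a[i] + b[4 * i] + c[i + 7];
    // contributes three strided loads: each pointer is a non-invariant affine
    // AddRec in SCEV. Loads through loop-invariant pointers are skipped.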
2374 for (const auto BB : L->blocks()) { 2375 for (auto &I : *BB) { 2376 LoadInst *LMemI = dyn_cast<LoadInst>(&I); 2377 if (!LMemI) 2378 continue; 2379 2380 Value *PtrValue = LMemI->getPointerOperand(); 2381 if (L->isLoopInvariant(PtrValue)) 2382 continue; 2383 2384 const SCEV *LSCEV = SE.getSCEV(PtrValue); 2385 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 2386 if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 2387 continue; 2388 2389 // FIXME? We could take pairing of unrolled load copies into account 2390 // by looking at the AddRec, but we would probably have to limit this 2391 // to loops with no stores or other memory optimization barriers. 2392 ++StridedLoads; 2393 // We've seen enough strided loads that seeing more won't make a 2394 // difference. 2395 if (StridedLoads > MaxStridedLoads / 2) 2396 return StridedLoads; 2397 } 2398 } 2399 return StridedLoads; 2400 }; 2401 2402 int StridedLoads = countStridedLoads(L, SE); 2403 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads 2404 << " strided loads\n"); 2405 // Pick the largest power of 2 unroll count that won't result in too many 2406 // strided loads. 2407 if (StridedLoads) { 2408 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); 2409 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " 2410 << UP.MaxCount << '\n'); 2411 } 2412 } 2413 2414 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 2415 TTI::UnrollingPreferences &UP, 2416 OptimizationRemarkEmitter *ORE) { 2417 // Enable partial unrolling and runtime unrolling. 2418 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 2419 2420 UP.UpperBound = true; 2421 2422 // For inner loop, it is more likely to be a hot one, and the runtime check 2423 // can be promoted out from LICM pass, so the overhead is less, let's try 2424 // a larger threshold to unroll more loops. 2425 if (L->getLoopDepth() > 1) 2426 UP.PartialThreshold *= 2; 2427 2428 // Disable partial & runtime unrolling on -Os. 2429 UP.PartialOptSizeThreshold = 0; 2430 2431 if (ST->getProcFamily() == AArch64Subtarget::Falkor && 2432 EnableFalkorHWPFUnrollFix) 2433 getFalkorUnrollingPreferences(L, SE, UP); 2434 2435 // Scan the loop: don't unroll loops with calls as this could prevent 2436 // inlining. Don't unroll vector loops either, as they don't benefit much from 2437 // unrolling. 2438 for (auto *BB : L->getBlocks()) { 2439 for (auto &I : *BB) { 2440 // Don't unroll vectorised loop. 
2441 if (I.getType()->isVectorTy()) 2442 return; 2443 2444 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 2445 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 2446 if (!isLoweredToCall(F)) 2447 continue; 2448 } 2449 return; 2450 } 2451 } 2452 } 2453 2454 // Enable runtime unrolling for in-order models 2455 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 2456 // checking for that case, we can ensure that the default behaviour is 2457 // unchanged 2458 if (ST->getProcFamily() != AArch64Subtarget::Others && 2459 !ST->getSchedModel().isOutOfOrder()) { 2460 UP.Runtime = true; 2461 UP.Partial = true; 2462 UP.UnrollRemainder = true; 2463 UP.DefaultUnrollRuntimeCount = 4; 2464 2465 UP.UnrollAndJam = true; 2466 UP.UnrollAndJamInnerLoopThreshold = 60; 2467 } 2468 } 2469 2470 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 2471 TTI::PeelingPreferences &PP) { 2472 BaseT::getPeelingPreferences(L, SE, PP); 2473 } 2474 2475 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 2476 Type *ExpectedType) { 2477 switch (Inst->getIntrinsicID()) { 2478 default: 2479 return nullptr; 2480 case Intrinsic::aarch64_neon_st2: 2481 case Intrinsic::aarch64_neon_st3: 2482 case Intrinsic::aarch64_neon_st4: { 2483 // Create a struct type 2484 StructType *ST = dyn_cast<StructType>(ExpectedType); 2485 if (!ST) 2486 return nullptr; 2487 unsigned NumElts = Inst->arg_size() - 1; 2488 if (ST->getNumElements() != NumElts) 2489 return nullptr; 2490 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2491 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 2492 return nullptr; 2493 } 2494 Value *Res = UndefValue::get(ExpectedType); 2495 IRBuilder<> Builder(Inst); 2496 for (unsigned i = 0, e = NumElts; i != e; ++i) { 2497 Value *L = Inst->getArgOperand(i); 2498 Res = Builder.CreateInsertValue(Res, L, i); 2499 } 2500 return Res; 2501 } 2502 case Intrinsic::aarch64_neon_ld2: 2503 case Intrinsic::aarch64_neon_ld3: 2504 case Intrinsic::aarch64_neon_ld4: 2505 if (Inst->getType() == ExpectedType) 2506 return Inst; 2507 return nullptr; 2508 } 2509 } 2510 2511 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 2512 MemIntrinsicInfo &Info) { 2513 switch (Inst->getIntrinsicID()) { 2514 default: 2515 break; 2516 case Intrinsic::aarch64_neon_ld2: 2517 case Intrinsic::aarch64_neon_ld3: 2518 case Intrinsic::aarch64_neon_ld4: 2519 Info.ReadMem = true; 2520 Info.WriteMem = false; 2521 Info.PtrVal = Inst->getArgOperand(0); 2522 break; 2523 case Intrinsic::aarch64_neon_st2: 2524 case Intrinsic::aarch64_neon_st3: 2525 case Intrinsic::aarch64_neon_st4: 2526 Info.ReadMem = false; 2527 Info.WriteMem = true; 2528 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 2529 break; 2530 } 2531 2532 switch (Inst->getIntrinsicID()) { 2533 default: 2534 return false; 2535 case Intrinsic::aarch64_neon_ld2: 2536 case Intrinsic::aarch64_neon_st2: 2537 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; 2538 break; 2539 case Intrinsic::aarch64_neon_ld3: 2540 case Intrinsic::aarch64_neon_st3: 2541 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; 2542 break; 2543 case Intrinsic::aarch64_neon_ld4: 2544 case Intrinsic::aarch64_neon_st4: 2545 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; 2546 break; 2547 } 2548 return true; 2549 } 2550 2551 /// See if \p I should be considered for address type promotion. We check if \p 2552 /// I is a sext with right type and used in memory accesses. 
If it used in a 2553 /// "complex" getelementptr, we allow it to be promoted without finding other 2554 /// sext instructions that sign extended the same initial value. A getelementptr 2555 /// is considered as "complex" if it has more than 2 operands. 2556 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( 2557 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { 2558 bool Considerable = false; 2559 AllowPromotionWithoutCommonHeader = false; 2560 if (!isa<SExtInst>(&I)) 2561 return false; 2562 Type *ConsideredSExtType = 2563 Type::getInt64Ty(I.getParent()->getParent()->getContext()); 2564 if (I.getType() != ConsideredSExtType) 2565 return false; 2566 // See if the sext is the one with the right type and used in at least one 2567 // GetElementPtrInst. 2568 for (const User *U : I.users()) { 2569 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { 2570 Considerable = true; 2571 // A getelementptr is considered as "complex" if it has more than 2 2572 // operands. We will promote a SExt used in such complex GEP as we 2573 // expect some computation to be merged if they are done on 64 bits. 2574 if (GEPInst->getNumOperands() > 2) { 2575 AllowPromotionWithoutCommonHeader = true; 2576 break; 2577 } 2578 } 2579 } 2580 return Considerable; 2581 } 2582 2583 bool AArch64TTIImpl::isLegalToVectorizeReduction( 2584 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { 2585 if (!VF.isScalable()) 2586 return true; 2587 2588 Type *Ty = RdxDesc.getRecurrenceType(); 2589 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) 2590 return false; 2591 2592 switch (RdxDesc.getRecurrenceKind()) { 2593 case RecurKind::Add: 2594 case RecurKind::FAdd: 2595 case RecurKind::And: 2596 case RecurKind::Or: 2597 case RecurKind::Xor: 2598 case RecurKind::SMin: 2599 case RecurKind::SMax: 2600 case RecurKind::UMin: 2601 case RecurKind::UMax: 2602 case RecurKind::FMin: 2603 case RecurKind::FMax: 2604 case RecurKind::SelectICmp: 2605 case RecurKind::SelectFCmp: 2606 case RecurKind::FMulAdd: 2607 return true; 2608 default: 2609 return false; 2610 } 2611 } 2612 2613 InstructionCost 2614 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2615 bool IsUnsigned, 2616 TTI::TargetCostKind CostKind) { 2617 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 2618 2619 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 2620 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); 2621 2622 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && 2623 "Both vector needs to be equally scalable"); 2624 2625 InstructionCost LegalizationCost = 0; 2626 if (LT.first > 1) { 2627 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 2628 unsigned MinMaxOpcode = 2629 Ty->isFPOrFPVectorTy() 2630 ? Intrinsic::maxnum 2631 : (IsUnsigned ? 
Intrinsic::umin : Intrinsic::smin); 2632 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); 2633 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 2634 } 2635 2636 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 2637 } 2638 2639 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 2640 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 2641 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2642 InstructionCost LegalizationCost = 0; 2643 if (LT.first > 1) { 2644 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 2645 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 2646 LegalizationCost *= LT.first - 1; 2647 } 2648 2649 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2650 assert(ISD && "Invalid opcode"); 2651 // Add the final reduction cost for the legal horizontal reduction 2652 switch (ISD) { 2653 case ISD::ADD: 2654 case ISD::AND: 2655 case ISD::OR: 2656 case ISD::XOR: 2657 case ISD::FADD: 2658 return LegalizationCost + 2; 2659 default: 2660 return InstructionCost::getInvalid(); 2661 } 2662 } 2663 2664 InstructionCost 2665 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 2666 Optional<FastMathFlags> FMF, 2667 TTI::TargetCostKind CostKind) { 2668 if (TTI::requiresOrderedReduction(FMF)) { 2669 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 2670 InstructionCost BaseCost = 2671 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2672 // Add on extra cost to reflect the extra overhead on some CPUs. We still 2673 // end up vectorizing for more computationally intensive loops. 2674 return BaseCost + FixedVTy->getNumElements(); 2675 } 2676 2677 if (Opcode != Instruction::FAdd) 2678 return InstructionCost::getInvalid(); 2679 2680 auto *VTy = cast<ScalableVectorType>(ValTy); 2681 InstructionCost Cost = 2682 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 2683 Cost *= getMaxNumElements(VTy->getElementCount()); 2684 return Cost; 2685 } 2686 2687 if (isa<ScalableVectorType>(ValTy)) 2688 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 2689 2690 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2691 MVT MTy = LT.second; 2692 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2693 assert(ISD && "Invalid opcode"); 2694 2695 // Horizontal adds can use the 'addv' instruction. We model the cost of these 2696 // instructions as twice a normal vector add, plus 1 for each legalization 2697 // step (LT.first). This is the only arithmetic vector reduction operation for 2698 // which we have an instruction. 
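  // For example, a v8i32 add reduction legalizes to two v4i32 halves
  // (LT.first == 2), so with the ADD/v4i32 entry below the returned cost is
  // (LT.first - 1) + 2 == 3: one add to combine the halves plus the modelled
  // addv.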
2699 // OR, XOR and AND costs should match the codegen from: 2700 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 2701 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 2702 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 2703 static const CostTblEntry CostTblNoPairwise[]{ 2704 {ISD::ADD, MVT::v8i8, 2}, 2705 {ISD::ADD, MVT::v16i8, 2}, 2706 {ISD::ADD, MVT::v4i16, 2}, 2707 {ISD::ADD, MVT::v8i16, 2}, 2708 {ISD::ADD, MVT::v4i32, 2}, 2709 {ISD::OR, MVT::v8i8, 15}, 2710 {ISD::OR, MVT::v16i8, 17}, 2711 {ISD::OR, MVT::v4i16, 7}, 2712 {ISD::OR, MVT::v8i16, 9}, 2713 {ISD::OR, MVT::v2i32, 3}, 2714 {ISD::OR, MVT::v4i32, 5}, 2715 {ISD::OR, MVT::v2i64, 3}, 2716 {ISD::XOR, MVT::v8i8, 15}, 2717 {ISD::XOR, MVT::v16i8, 17}, 2718 {ISD::XOR, MVT::v4i16, 7}, 2719 {ISD::XOR, MVT::v8i16, 9}, 2720 {ISD::XOR, MVT::v2i32, 3}, 2721 {ISD::XOR, MVT::v4i32, 5}, 2722 {ISD::XOR, MVT::v2i64, 3}, 2723 {ISD::AND, MVT::v8i8, 15}, 2724 {ISD::AND, MVT::v16i8, 17}, 2725 {ISD::AND, MVT::v4i16, 7}, 2726 {ISD::AND, MVT::v8i16, 9}, 2727 {ISD::AND, MVT::v2i32, 3}, 2728 {ISD::AND, MVT::v4i32, 5}, 2729 {ISD::AND, MVT::v2i64, 3}, 2730 }; 2731 switch (ISD) { 2732 default: 2733 break; 2734 case ISD::ADD: 2735 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 2736 return (LT.first - 1) + Entry->Cost; 2737 break; 2738 case ISD::XOR: 2739 case ISD::AND: 2740 case ISD::OR: 2741 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 2742 if (!Entry) 2743 break; 2744 auto *ValVTy = cast<FixedVectorType>(ValTy); 2745 if (!ValVTy->getElementType()->isIntegerTy(1) && 2746 MTy.getVectorNumElements() <= ValVTy->getNumElements() && 2747 isPowerOf2_32(ValVTy->getNumElements())) { 2748 InstructionCost ExtraCost = 0; 2749 if (LT.first != 1) { 2750 // Type needs to be split, so there is an extra cost of LT.first - 1 2751 // arithmetic ops. 2752 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 2753 MTy.getVectorNumElements()); 2754 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 2755 ExtraCost *= LT.first - 1; 2756 } 2757 return Entry->Cost + ExtraCost; 2758 } 2759 break; 2760 } 2761 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 2762 } 2763 2764 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { 2765 static const CostTblEntry ShuffleTbl[] = { 2766 { TTI::SK_Splice, MVT::nxv16i8, 1 }, 2767 { TTI::SK_Splice, MVT::nxv8i16, 1 }, 2768 { TTI::SK_Splice, MVT::nxv4i32, 1 }, 2769 { TTI::SK_Splice, MVT::nxv2i64, 1 }, 2770 { TTI::SK_Splice, MVT::nxv2f16, 1 }, 2771 { TTI::SK_Splice, MVT::nxv4f16, 1 }, 2772 { TTI::SK_Splice, MVT::nxv8f16, 1 }, 2773 { TTI::SK_Splice, MVT::nxv2bf16, 1 }, 2774 { TTI::SK_Splice, MVT::nxv4bf16, 1 }, 2775 { TTI::SK_Splice, MVT::nxv8bf16, 1 }, 2776 { TTI::SK_Splice, MVT::nxv2f32, 1 }, 2777 { TTI::SK_Splice, MVT::nxv4f32, 1 }, 2778 { TTI::SK_Splice, MVT::nxv2f64, 1 }, 2779 }; 2780 2781 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2782 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); 2783 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2784 EVT PromotedVT = LT.second.getScalarType() == MVT::i1 2785 ? 
TLI->getPromotedVTForPredicate(EVT(LT.second)) 2786 : LT.second; 2787 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); 2788 InstructionCost LegalizationCost = 0; 2789 if (Index < 0) { 2790 LegalizationCost = 2791 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, 2792 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2793 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, 2794 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2795 } 2796 2797 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp 2798 // Cost performed on a promoted type. 2799 if (LT.second.getScalarType() == MVT::i1) { 2800 LegalizationCost += 2801 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 2802 TTI::CastContextHint::None, CostKind) + 2803 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 2804 TTI::CastContextHint::None, CostKind); 2805 } 2806 const auto *Entry = 2807 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 2808 assert(Entry && "Illegal Type for Splice"); 2809 LegalizationCost += Entry->Cost; 2810 return LegalizationCost * LT.first; 2811 } 2812 2813 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 2814 VectorType *Tp, 2815 ArrayRef<int> Mask, int Index, 2816 VectorType *SubTp, 2817 ArrayRef<const Value *> Args) { 2818 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 2819 // If we have a Mask, and the LT is being legalized somehow, split the Mask 2820 // into smaller vectors and sum the cost of each shuffle. 2821 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && 2822 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && 2823 cast<FixedVectorType>(Tp)->getNumElements() > 2824 LT.second.getVectorNumElements() && 2825 !Index && !SubTp) { 2826 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); 2827 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); 2828 unsigned LTNumElts = LT.second.getVectorNumElements(); 2829 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; 2830 VectorType *NTp = 2831 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); 2832 InstructionCost Cost; 2833 for (unsigned N = 0; N < NumVecs; N++) { 2834 SmallVector<int> NMask; 2835 // Split the existing mask into chunks of size LTNumElts. Track the source 2836 // sub-vectors to ensure the result has at most 2 inputs. 2837 unsigned Source1, Source2; 2838 unsigned NumSources = 0; 2839 for (unsigned E = 0; E < LTNumElts; E++) { 2840 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] 2841 : UndefMaskElem; 2842 if (MaskElt < 0) { 2843 NMask.push_back(UndefMaskElem); 2844 continue; 2845 } 2846 2847 // Calculate which source from the input this comes from and whether it 2848 // is new to us. 2849 unsigned Source = MaskElt / LTNumElts; 2850 if (NumSources == 0) { 2851 Source1 = Source; 2852 NumSources = 1; 2853 } else if (NumSources == 1 && Source != Source1) { 2854 Source2 = Source; 2855 NumSources = 2; 2856 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { 2857 NumSources++; 2858 } 2859 2860 // Add to the new mask. For the NumSources>2 case these are not correct, 2861 // but are only used for the modular lane number. 
2862 if (Source == Source1) 2863 NMask.push_back(MaskElt % LTNumElts); 2864 else if (Source == Source2) 2865 NMask.push_back(MaskElt % LTNumElts + LTNumElts); 2866 else 2867 NMask.push_back(MaskElt % LTNumElts); 2868 } 2869 // If the sub-mask has at most 2 input sub-vectors then re-cost it using 2870 // getShuffleCost. If not then cost it using the worst case. 2871 if (NumSources <= 2) 2872 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc 2873 : TTI::SK_PermuteTwoSrc, 2874 NTp, NMask, 0, nullptr, Args); 2875 else if (any_of(enumerate(NMask), [&](const auto &ME) { 2876 return ME.value() % LTNumElts == ME.index(); 2877 })) 2878 Cost += LTNumElts - 1; 2879 else 2880 Cost += LTNumElts; 2881 } 2882 return Cost; 2883 } 2884 2885 Kind = improveShuffleKindFromMask(Kind, Mask); 2886 2887 // Check for broadcast loads. 2888 if (Kind == TTI::SK_Broadcast) { 2889 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); 2890 if (IsLoad && LT.second.isVector() && 2891 isLegalBroadcastLoad(Tp->getElementType(), 2892 LT.second.getVectorElementCount())) 2893 return 0; // broadcast is handled by ld1r 2894 } 2895 2896 // If we have 4 elements for the shuffle and a Mask, get the cost straight 2897 // from the perfect shuffle tables. 2898 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && 2899 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && 2900 all_of(Mask, [](int E) { return E < 8; })) 2901 return getPerfectShuffleCost(Mask); 2902 2903 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 2904 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 2905 Kind == TTI::SK_Reverse) { 2906 2907 static const CostTblEntry ShuffleTbl[] = { 2908 // Broadcast shuffle kinds can be performed with 'dup'. 2909 { TTI::SK_Broadcast, MVT::v8i8, 1 }, 2910 { TTI::SK_Broadcast, MVT::v16i8, 1 }, 2911 { TTI::SK_Broadcast, MVT::v4i16, 1 }, 2912 { TTI::SK_Broadcast, MVT::v8i16, 1 }, 2913 { TTI::SK_Broadcast, MVT::v2i32, 1 }, 2914 { TTI::SK_Broadcast, MVT::v4i32, 1 }, 2915 { TTI::SK_Broadcast, MVT::v2i64, 1 }, 2916 { TTI::SK_Broadcast, MVT::v2f32, 1 }, 2917 { TTI::SK_Broadcast, MVT::v4f32, 1 }, 2918 { TTI::SK_Broadcast, MVT::v2f64, 1 }, 2919 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 2920 // 'zip1/zip2' instructions. 2921 { TTI::SK_Transpose, MVT::v8i8, 1 }, 2922 { TTI::SK_Transpose, MVT::v16i8, 1 }, 2923 { TTI::SK_Transpose, MVT::v4i16, 1 }, 2924 { TTI::SK_Transpose, MVT::v8i16, 1 }, 2925 { TTI::SK_Transpose, MVT::v2i32, 1 }, 2926 { TTI::SK_Transpose, MVT::v4i32, 1 }, 2927 { TTI::SK_Transpose, MVT::v2i64, 1 }, 2928 { TTI::SK_Transpose, MVT::v2f32, 1 }, 2929 { TTI::SK_Transpose, MVT::v4f32, 1 }, 2930 { TTI::SK_Transpose, MVT::v2f64, 1 }, 2931 // Select shuffle kinds. 2932 // TODO: handle vXi8/vXi16. 2933 { TTI::SK_Select, MVT::v2i32, 1 }, // mov. 2934 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). 2935 { TTI::SK_Select, MVT::v2i64, 1 }, // mov. 2936 { TTI::SK_Select, MVT::v2f32, 1 }, // mov. 2937 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). 2938 { TTI::SK_Select, MVT::v2f64, 1 }, // mov. 2939 // PermuteSingleSrc shuffle kinds. 2940 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. 2941 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. 2942 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. 2943 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. 2944 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. 
  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
      all_of(Mask, [](int E) { return E < 8; }))
    return getPerfectShuffleCost(Mask);

  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse) {

    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        { TTI::SK_Broadcast, MVT::v8i8, 1 },
        { TTI::SK_Broadcast, MVT::v16i8, 1 },
        { TTI::SK_Broadcast, MVT::v4i16, 1 },
        { TTI::SK_Broadcast, MVT::v8i16, 1 },
        { TTI::SK_Broadcast, MVT::v2i32, 1 },
        { TTI::SK_Broadcast, MVT::v4i32, 1 },
        { TTI::SK_Broadcast, MVT::v2i64, 1 },
        { TTI::SK_Broadcast, MVT::v2f32, 1 },
        { TTI::SK_Broadcast, MVT::v4f32, 1 },
        { TTI::SK_Broadcast, MVT::v2f64, 1 },
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        { TTI::SK_Transpose, MVT::v8i8, 1 },
        { TTI::SK_Transpose, MVT::v16i8, 1 },
        { TTI::SK_Transpose, MVT::v4i16, 1 },
        { TTI::SK_Transpose, MVT::v8i16, 1 },
        { TTI::SK_Transpose, MVT::v2i32, 1 },
        { TTI::SK_Transpose, MVT::v4i32, 1 },
        { TTI::SK_Transpose, MVT::v2i64, 1 },
        { TTI::SK_Transpose, MVT::v2f32, 1 },
        { TTI::SK_Transpose, MVT::v4f32, 1 },
        { TTI::SK_Transpose, MVT::v2f64, 1 },
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
        { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
        { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
        { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
        { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
        { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
        // PermuteSingleSrc shuffle kinds.
        { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
        { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
        { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
        { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
        { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT
        { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64
        { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64
        { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64
        // Broadcast shuffle kinds for scalable vectors
        { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
        { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
        { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
        { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
        { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
        { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
        { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
        { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
        { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
        { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
        { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
        // Handle the cases for vector.reverse with scalable vectors
        { TTI::SK_Reverse, MVT::nxv16i8, 1 },
        { TTI::SK_Reverse, MVT::nxv8i16, 1 },
        { TTI::SK_Reverse, MVT::nxv4i32, 1 },
        { TTI::SK_Reverse, MVT::nxv2i64, 1 },
        { TTI::SK_Reverse, MVT::nxv2f16, 1 },
        { TTI::SK_Reverse, MVT::nxv4f16, 1 },
        { TTI::SK_Reverse, MVT::nxv8f16, 1 },
        { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
        { TTI::SK_Reverse, MVT::nxv2f32, 1 },
        { TTI::SK_Reverse, MVT::nxv4f32, 1 },
        { TTI::SK_Reverse, MVT::nxv2f64, 1 },
        { TTI::SK_Reverse, MVT::nxv16i1, 1 },
        { TTI::SK_Reverse, MVT::nxv8i1, 1 },
        { TTI::SK_Reverse, MVT::nxv4i1, 1 },
        { TTI::SK_Reverse, MVT::nxv2i1, 1 },
    };
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
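  // For example (an illustrative case of the check below): inserting a v2f32
  // subvector into v4f32 at index 0 or 2 only touches one 64-bit half of the
  // destination and amounts to a D-register move, so it is costed as
  // SubLT.first. An insert at index 1 straddles the two halves, is not
  // "aligned", and falls through to the generic shuffle cost.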
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

bool AArch64TTIImpl::preferPredicateOverEpilogue(
    Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
    return false;

  // Work out the minimum set of tail-folding features this loop requires.
  TailFoldingKind Required; // Defaults to 0.
  if (LVL->getReductionVars().size())
    Required.add(TailFoldingKind::TFReductions);
  if (LVL->getFirstOrderRecurrences().size())
    Required.add(TailFoldingKind::TFRecurrences);
  if (!Required)
    Required.add(TailFoldingKind::TFSimple);

  // Only tail-fold if every required feature has been enabled.
  return (TailFoldingKindLoc & Required) == Required;
}
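// A sketch of how the bits above combine (illustrative, not an exhaustive
// specification): if only TFReductions is enabled in TailFoldingKindLoc, a
// loop whose only special feature is a reduction computes
// Required == TFReductions and is tail-folded, whereas a plain loop computes
// Required == TFSimple, which is not enabled, so it keeps a scalar epilogue.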