1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements a TargetTransformInfo analysis pass specific to the 10 /// X86 target machine. It uses the target's detailed information to provide 11 /// more precise answers to certain TTI queries, while letting the target 12 /// independent and default TTI implementations handle the rest. 13 /// 14 //===----------------------------------------------------------------------===// 15 /// A note about the cost model numbers used below: the numbers correspond to 16 /// a "generic" X86 CPU rather than to a 17 /// specific CPU model. Usually the numbers correspond to the CPU where the 18 /// feature first appeared. For example, if we do Subtarget.hasSSE42() in 19 /// the lookups below, the cost is based on Nehalem as that was the first CPU 20 /// to support that feature level and thus most likely has the worst case cost, 21 /// although we may discard an outlying worst cost from one CPU (e.g. Atom). 22 /// 23 /// Some examples of other technologies/CPUs: 24 /// SSE 3 - Pentium4 / Athlon64 25 /// SSE 4.1 - Penryn 26 /// SSE 4.2 - Nehalem / Silvermont 27 /// AVX - Sandy Bridge / Jaguar / Bulldozer 28 /// AVX2 - Haswell / Ryzen 29 /// AVX-512 - Xeon Phi / Skylake 30 /// 31 /// And some examples of target-dependent instruction costs (latency): 32 /// divss sqrtss rsqrtss 33 /// AMD K7 11-16 19 3 34 /// Piledriver 9-24 13-15 5 35 /// Jaguar 14 16 2 36 /// Pentium II,III 18 30 2 37 /// Nehalem 7-14 7-18 3 38 /// Haswell 10-13 11 5 39 /// 40 /// Interpreting the 4 TargetCostKind types: 41 /// TCK_RecipThroughput and TCK_Latency should try to match the worst case 42 /// values reported by the CPU scheduler models (and llvm-mca). 43 /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the 44 /// actual encoding size of the instruction. 45 /// TCK_SizeAndLatency should match the worst case micro-op counts reported 46 /// by the CPU scheduler models (and llvm-mca), to ensure that they are 47 /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values, which are 48 /// often used as the cost thresholds where TCK_SizeAndLatency is requested. 49 //===----------------------------------------------------------------------===// 50 51 #include "X86TargetTransformInfo.h" 52 #include "llvm/Analysis/TargetTransformInfo.h" 53 #include "llvm/CodeGen/BasicTTIImpl.h" 54 #include "llvm/CodeGen/CostTable.h" 55 #include "llvm/CodeGen/TargetLowering.h" 56 #include "llvm/IR/InstIterator.h" 57 #include "llvm/IR/IntrinsicInst.h" 58 #include "llvm/Support/Debug.h" 59 #include <optional> 60 61 using namespace llvm; 62 63 #define DEBUG_TYPE "x86tti" 64 65 //===----------------------------------------------------------------------===// 66 // 67 // X86 cost model. 68 // 69 //===----------------------------------------------------------------------===// 70 71 // Helper struct to store/access costs for each cost kind. 72 // TODO: Move this to allow other targets to use it?
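// All of the cost tables below are consulted with essentially the same
// pattern, sketched here for orientation ("SomeCostTable" is a placeholder
// name, not a table defined in this file):
//   if (const auto *Entry = CostTableLookup(SomeCostTable, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind]) // nullopt if that kind is unset (~0U)
//       return LT.first * *KindCost;             // scale by the legalization split count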
73 struct CostKindCosts { 74 unsigned RecipThroughputCost = ~0U; 75 unsigned LatencyCost = ~0U; 76 unsigned CodeSizeCost = ~0U; 77 unsigned SizeAndLatencyCost = ~0U; 78 79 std::optional<unsigned> 80 operator[](TargetTransformInfo::TargetCostKind Kind) const { 81 unsigned Cost = ~0U; 82 switch (Kind) { 83 case TargetTransformInfo::TCK_RecipThroughput: 84 Cost = RecipThroughputCost; 85 break; 86 case TargetTransformInfo::TCK_Latency: 87 Cost = LatencyCost; 88 break; 89 case TargetTransformInfo::TCK_CodeSize: 90 Cost = CodeSizeCost; 91 break; 92 case TargetTransformInfo::TCK_SizeAndLatency: 93 Cost = SizeAndLatencyCost; 94 break; 95 } 96 if (Cost == ~0U) 97 return std::nullopt; 98 return Cost; 99 } 100 }; 101 using CostKindTblEntry = CostTblEntryT<CostKindCosts>; 102 103 TargetTransformInfo::PopcntSupportKind 104 X86TTIImpl::getPopcntSupport(unsigned TyWidth) { 105 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 106 // TODO: Currently the __builtin_popcount() implementation using SSE3 107 // instructions is inefficient. Once the problem is fixed, we should 108 // call ST->hasSSE3() instead of ST->hasPOPCNT(). 109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; 110 } 111 112 std::optional<unsigned> X86TTIImpl::getCacheSize( 113 TargetTransformInfo::CacheLevel Level) const { 114 switch (Level) { 115 case TargetTransformInfo::CacheLevel::L1D: 116 // - Penryn 117 // - Nehalem 118 // - Westmere 119 // - Sandy Bridge 120 // - Ivy Bridge 121 // - Haswell 122 // - Broadwell 123 // - Skylake 124 // - Kabylake 125 return 32 * 1024; // 32 KByte 126 case TargetTransformInfo::CacheLevel::L2D: 127 // - Penryn 128 // - Nehalem 129 // - Westmere 130 // - Sandy Bridge 131 // - Ivy Bridge 132 // - Haswell 133 // - Broadwell 134 // - Skylake 135 // - Kabylake 136 return 256 * 1024; // 256 KByte 137 } 138 139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); 140 } 141 142 std::optional<unsigned> X86TTIImpl::getCacheAssociativity( 143 TargetTransformInfo::CacheLevel Level) const { 144 // - Penryn 145 // - Nehalem 146 // - Westmere 147 // - Sandy Bridge 148 // - Ivy Bridge 149 // - Haswell 150 // - Broadwell 151 // - Skylake 152 // - Kabylake 153 switch (Level) { 154 case TargetTransformInfo::CacheLevel::L1D: 155 [[fallthrough]]; 156 case TargetTransformInfo::CacheLevel::L2D: 157 return 8; 158 } 159 160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); 161 } 162 163 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { 164 bool Vector = (ClassID == 1); 165 if (Vector && !ST->hasSSE1()) 166 return 0; 167 168 if (ST->is64Bit()) { 169 if (Vector && ST->hasAVX512()) 170 return 32; 171 return 16; 172 } 173 return 8; 174 } 175 176 TypeSize 177 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 178 unsigned PreferVectorWidth = ST->getPreferVectorWidth(); 179 switch (K) { 180 case TargetTransformInfo::RGK_Scalar: 181 return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); 182 case TargetTransformInfo::RGK_FixedWidthVector: 183 if (ST->hasAVX512() && PreferVectorWidth >= 512) 184 return TypeSize::getFixed(512); 185 if (ST->hasAVX() && PreferVectorWidth >= 256) 186 return TypeSize::getFixed(256); 187 if (ST->hasSSE1() && PreferVectorWidth >= 128) 188 return TypeSize::getFixed(128); 189 return TypeSize::getFixed(0); 190 case TargetTransformInfo::RGK_ScalableVector: 191 return TypeSize::getScalable(0); 192 } 193 194 llvm_unreachable("Unsupported register kind"); 195 } 196 197 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { 198 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 199 .getFixedValue(); 200 } 201 202 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { 203 // If the loop will not be vectorized, don't interleave the loop. 204 // Let the regular unroller handle it instead, which saves the overflow 205 // check and memory check cost. 206 if (VF == 1) 207 return 1; 208 209 if (ST->isAtom()) 210 return 1; 211 212 // Sandy Bridge and Haswell have multiple execution ports and pipelined 213 // vector units. 214 if (ST->hasAVX()) 215 return 4; 216 217 return 2; 218 } 219 220 InstructionCost X86TTIImpl::getArithmeticInstrCost( 221 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 222 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 223 ArrayRef<const Value *> Args, 224 const Instruction *CxtI) { 225 226 // vXi8 multiplications are always promoted to vXi16. 227 if (Opcode == Instruction::Mul && Ty->isVectorTy() && 228 Ty->getScalarSizeInBits() == 8) { 229 Type *WideVecTy = 230 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); 231 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, 232 TargetTransformInfo::CastContextHint::None, 233 CostKind) + 234 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, 235 TargetTransformInfo::CastContextHint::None, 236 CostKind) + 237 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info); 238 } 239 240 // Legalize the type. 241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 242 243 int ISD = TLI->InstructionOpcodeToISD(Opcode); 244 assert(ISD && "Invalid opcode"); 245 246 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && 247 LT.second.getScalarType() == MVT::i32) { 248 // Check if the operands can be represented as a smaller datatype. 249 bool Op1Signed = false, Op2Signed = false; 250 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); 251 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); 252 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); 253 bool SignedMode = Op1Signed || Op2Signed; 254 255 // If both are representable as i15 and at least one is constant, 256 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41), then we 257 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
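// (The i15 bound comes from pmaddwd operating on signed i16 lanes: a value
// that also fits in 15 bits is non-negative as an i16, so sign- and
// zero-extension agree and the widened multiply stays exact.)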
258 if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) { 259 bool Op1Constant = 260 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); 261 bool Op2Constant = 262 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); 263 bool Op1Sext = isa<SExtInst>(Args[0]) && 264 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); 265 bool Op2Sext = isa<SExtInst>(Args[1]) && 266 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); 267 268 bool IsZeroExtended = !Op1Signed || !Op2Signed; 269 bool IsConstant = Op1Constant || Op2Constant; 270 bool IsSext = Op1Sext || Op2Sext; 271 if (IsConstant || IsZeroExtended || IsSext) 272 LT.second = 273 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); 274 } 275 276 // Check if the vXi32 operands can be shrunk into a smaller datatype. 277 // This should match the codegen from reduceVMULWidth. 278 // TODO: Make this generic (!ST->hasSSE41() || ST->isPMULLDSlow()). 279 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { 280 if (OpMinSize <= 7) 281 return LT.first * 3; // pmullw/sext 282 if (!SignedMode && OpMinSize <= 8) 283 return LT.first * 3; // pmullw/zext 284 if (OpMinSize <= 15) 285 return LT.first * 5; // pmullw/pmulhw/pshuf 286 if (!SignedMode && OpMinSize <= 16) 287 return LT.first * 5; // pmullw/pmulhw/pshuf 288 } 289 } 290 291 // Vector multiply by pow2 will be simplified to shifts. 292 // Vector multiply by -pow2 will be simplified to shifts/negates. 293 if (ISD == ISD::MUL && Op2Info.isConstant() && 294 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { 295 InstructionCost Cost = 296 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, 297 Op1Info.getNoProps(), Op2Info.getNoProps()); 298 if (Op2Info.isNegatedPowerOf2()) 299 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 300 return Cost; 301 } 302 303 // On X86, vector signed division by a constant power-of-two is 304 // normally expanded to the sequence SRA + SRL + ADD + SRA. 305 // The OperandValue properties may not be the same as those of the previous 306 // operation; conservatively assume OP_None. 307 if ((ISD == ISD::SDIV || ISD == ISD::SREM) && 308 Op2Info.isConstant() && Op2Info.isPowerOf2()) { 309 InstructionCost Cost = 310 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, 311 Op1Info.getNoProps(), Op2Info.getNoProps()); 312 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, 313 Op1Info.getNoProps(), Op2Info.getNoProps()); 314 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, 315 Op1Info.getNoProps(), Op2Info.getNoProps()); 316 317 if (ISD == ISD::SREM) { 318 // For SREM: (X % C) is equivalent to (X - (X/C)*C) 319 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), 320 Op2Info.getNoProps()); 321 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(), 322 Op2Info.getNoProps()); 323 } 324 325 return Cost; 326 } 327 328 // Vector unsigned division/remainder will be simplified to shifts/masks. 329 if ((ISD == ISD::UDIV || ISD == ISD::UREM) && 330 Op2Info.isConstant() && Op2Info.isPowerOf2()) { 331 if (ISD == ISD::UDIV) 332 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, 333 Op1Info.getNoProps(), Op2Info.getNoProps()); 334 // UREM 335 return getArithmeticInstrCost(Instruction::And, Ty, CostKind, 336 Op1Info.getNoProps(), Op2Info.getNoProps()); 337 } 338 339 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { 340 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
341 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. 342 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb. 343 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand. 344 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand. 345 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb. 346 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand. 347 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand. 348 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb. 349 350 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw 351 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw 352 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw 353 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw 354 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw 355 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw 356 }; 357 358 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) 359 if (const auto *Entry = 360 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second)) 361 if (auto KindCost = Entry->Cost[CostKind]) 362 return LT.first * *KindCost; 363 364 static const CostKindTblEntry AVX512UniformConstCostTable[] = { 365 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand. 366 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand. 367 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb. 368 369 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split. 370 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split. 371 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split. 372 373 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld 374 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld 375 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad 376 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld 377 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld 378 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad 379 380 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq 381 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq 382 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq 383 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq 384 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq 385 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq 386 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq 387 388 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence 389 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence 390 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence 391 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence 392 }; 393 394 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) 395 if (const auto *Entry = 396 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second)) 397 if (auto KindCost = Entry->Cost[CostKind]) 398 return LT.first * *KindCost; 399 400 static const CostKindTblEntry AVX2UniformConstCostTable[] = { 401 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand. 402 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand. 403 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb. 404 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand. 405 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand. 406 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb. 
407 408 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw 409 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw 410 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw 411 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw 412 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw 413 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw 414 415 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld 416 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld 417 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad 418 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld 419 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld 420 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad 421 422 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq 423 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq 424 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. 425 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq 426 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq 427 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split. 428 429 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence 430 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence 431 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence 432 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence 433 }; 434 435 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) 436 if (const auto *Entry = 437 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second)) 438 if (auto KindCost = Entry->Cost[CostKind]) 439 return LT.first * *KindCost; 440 441 static const CostKindTblEntry AVXUniformConstCostTable[] = { 442 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand. 443 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand. 444 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. 445 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split. 446 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split. 447 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. 448 449 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw. 450 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw. 451 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw. 452 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split. 453 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split. 454 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split. 455 456 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld. 457 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld. 458 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad. 459 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split. 460 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split. 461 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split. 462 463 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq. 464 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq. 465 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. 466 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. 467 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. 468 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split. 469 470 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split. 471 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split. 472 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split. 473 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split. 474 }; 475 476 // XOP has faster vXi8 shifts. 
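// (XOP has per-element byte shifts (vpshlb/vpshab), so vXi8 cases skip this
// table and are handled by the XOP shift table further down.)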
477 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && 478 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 479 if (const auto *Entry = 480 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second)) 481 if (auto KindCost = Entry->Cost[CostKind]) 482 return LT.first * *KindCost; 483 484 static const CostKindTblEntry SSE2UniformConstCostTable[] = { 485 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. 486 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. 487 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. 488 489 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw. 490 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw. 491 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw. 492 493 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld 494 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld. 495 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad. 496 497 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq. 498 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq. 499 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle. 500 501 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence 502 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence 503 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence 504 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence 505 }; 506 507 // XOP has faster vXi8 shifts. 508 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && 509 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 510 if (const auto *Entry = 511 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) 512 if (auto KindCost = Entry->Cost[CostKind]) 513 return LT.first * *KindCost; 514 515 static const CostKindTblEntry AVX512BWConstCostTable[] = { 516 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence 517 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 518 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence 519 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 520 521 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence 522 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence 523 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence 524 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence 525 }; 526 527 if (Op2Info.isConstant() && ST->hasBWI()) 528 if (const auto *Entry = 529 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) 530 if (auto KindCost = Entry->Cost[CostKind]) 531 return LT.first * *KindCost; 532 533 static const CostKindTblEntry AVX512ConstCostTable[] = { 534 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence 535 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence 536 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence 537 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence 538 539 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence 540 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence 541 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence 542 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence 543 544 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence 545 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence 546 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence 547 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence 548 }; 549 550 if (Op2Info.isConstant() && ST->hasAVX512()) 551 if (const auto *Entry = 552 CostTableLookup(AVX512ConstCostTable, ISD, 
LT.second)) 553 if (auto KindCost = Entry->Cost[CostKind]) 554 return LT.first * *KindCost; 555 556 static const CostKindTblEntry AVX2ConstCostTable[] = { 557 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence 558 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 559 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence 560 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 561 562 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence 563 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence 564 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence 565 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence 566 567 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence 568 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence 569 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence 570 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence 571 }; 572 573 if (Op2Info.isConstant() && ST->hasAVX2()) 574 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) 575 if (auto KindCost = Entry->Cost[CostKind]) 576 return LT.first * *KindCost; 577 578 static const CostKindTblEntry AVXConstCostTable[] = { 579 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. 580 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. 581 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. 582 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. 583 584 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split. 585 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split. 586 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split. 587 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split. 588 589 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence 590 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence 591 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split. 592 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split. 
593 }; 594 595 if (Op2Info.isConstant() && ST->hasAVX()) 596 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second)) 597 if (auto KindCost = Entry->Cost[CostKind]) 598 return LT.first * *KindCost; 599 600 static const CostKindTblEntry SSE41ConstCostTable[] = { 601 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence 602 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence 603 }; 604 605 if (Op2Info.isConstant() && ST->hasSSE41()) 606 if (const auto *Entry = 607 CostTableLookup(SSE41ConstCostTable, ISD, LT.second)) 608 if (auto KindCost = Entry->Cost[CostKind]) 609 return LT.first * *KindCost; 610 611 static const CostKindTblEntry SSE2ConstCostTable[] = { 612 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence 613 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 614 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence 615 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence 616 617 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence 618 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence 619 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence 620 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence 621 622 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence 623 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence 624 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence 625 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence 626 }; 627 628 if (Op2Info.isConstant() && ST->hasSSE2()) 629 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) 630 if (auto KindCost = Entry->Cost[CostKind]) 631 return LT.first * *KindCost; 632 633 static const CostKindTblEntry AVX512BWUniformCostTable[] = { 634 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. 635 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand. 636 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb. 637 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. 638 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. 639 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb. 640 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand. 641 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand. 642 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb. 643 644 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw 645 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw 646 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw 647 }; 648 649 if (ST->hasBWI() && Op2Info.isUniform()) 650 if (const auto *Entry = 651 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second)) 652 if (auto KindCost = Entry->Cost[CostKind]) 653 return LT.first * *KindCost; 654 655 static const CostKindTblEntry AVX512UniformCostTable[] = { 656 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split. 657 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split. 658 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
659 660 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld 661 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld 662 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad 663 664 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq 665 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq 666 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq 667 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq 668 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq 669 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq 670 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq 671 }; 672 673 if (ST->hasAVX512() && Op2Info.isUniform()) 674 if (const auto *Entry = 675 CostTableLookup(AVX512UniformCostTable, ISD, LT.second)) 676 if (auto KindCost = Entry->Cost[CostKind]) 677 return LT.first * *KindCost; 678 679 static const CostKindTblEntry AVX2UniformCostTable[] = { 680 // Uniform splats are cheaper for the following instructions. 681 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. 682 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand. 683 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb. 684 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. 685 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. 686 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb. 687 688 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw. 689 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw. 690 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw. 691 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw. 692 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw. 693 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw. 694 695 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld 696 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld 697 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad 698 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld 699 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld 700 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad 701 702 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq 703 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq 704 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle. 705 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq 706 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq 707 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle. 708 }; 709 710 if (ST->hasAVX2() && Op2Info.isUniform()) 711 if (const auto *Entry = 712 CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) 713 if (auto KindCost = Entry->Cost[CostKind]) 714 return LT.first * *KindCost; 715 716 static const CostKindTblEntry AVXUniformCostTable[] = { 717 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand. 718 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand. 719 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb. 720 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split. 721 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split. 722 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split. 723 724 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw. 725 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw. 726 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw. 727 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split. 728 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split. 729 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split. 730 731 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld. 
732 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld. 733 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad. 734 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split. 735 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split. 736 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split. 737 738 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq. 739 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq. 740 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle. 741 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split. 742 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split. 743 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split. 744 }; 745 746 // XOP has faster vXi8 shifts. 747 if (ST->hasAVX() && Op2Info.isUniform() && 748 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 749 if (const auto *Entry = 750 CostTableLookup(AVXUniformCostTable, ISD, LT.second)) 751 if (auto KindCost = Entry->Cost[CostKind]) 752 return LT.first * *KindCost; 753 754 static const CostKindTblEntry SSE2UniformCostTable[] = { 755 // Uniform splats are cheaper for the following instructions. 756 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand. 757 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand. 758 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence. 759 760 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw. 761 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw. 762 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw. 763 764 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld 765 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld. 766 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad. 767 768 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq. 769 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq. 770 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub. 771 }; 772 773 if (ST->hasSSE2() && Op2Info.isUniform() && 774 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) 775 if (const auto *Entry = 776 CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) 777 if (auto KindCost = Entry->Cost[CostKind]) 778 return LT.first * *KindCost; 779 780 static const CostKindTblEntry AVX512DQCostTable[] = { 781 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq 782 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq 783 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq 784 }; 785 786 // Look for AVX512DQ lowering tricks for custom cases. 787 if (ST->hasDQI()) 788 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) 789 if (auto KindCost = Entry->Cost[CostKind]) 790 return LT.first * *KindCost; 791 792 static const CostKindTblEntry AVX512BWCostTable[] = { 793 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence. 794 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence. 795 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence. 796 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence. 797 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence. 798 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence. 799 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence. 800 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence. 801 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence. 
802 803 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw 804 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw 805 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw 806 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw 807 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw 808 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw 809 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw 810 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw 811 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw 812 813 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb 814 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw 815 816 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb 817 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw 818 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd 819 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq 820 821 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb 822 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw 823 824 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw 825 826 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb 827 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw 828 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd 829 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq 830 }; 831 832 // Look for AVX512BW lowering tricks for custom cases. 833 if (ST->hasBWI()) 834 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) 835 if (auto KindCost = Entry->Cost[CostKind]) 836 return LT.first * *KindCost; 837 838 static const CostKindTblEntry AVX512CostTable[] = { 839 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence. 840 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence. 841 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence. 842 843 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. 844 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. 845 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence. 
846 847 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, 848 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, 849 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, 850 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, 851 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, 852 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, 853 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, 854 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, 855 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, 856 857 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, 858 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, 859 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, 860 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, 861 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, 862 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, 863 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, 864 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, 865 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, 866 867 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split 868 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split 869 870 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split 871 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split 872 873 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } }, 874 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } }, 875 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } }, 876 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } }, 877 878 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } }, 879 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } }, 880 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } }, 881 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } }, 882 883 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } }, 884 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } }, 885 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } }, 886 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } }, 887 888 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) 889 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) 890 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) 891 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add 892 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/ 893 894 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ 895 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 896 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 897 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 898 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 899 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 900 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 901 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 902 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 903 904 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ 905 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ 906 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/ 907 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/ 908 909 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ 910 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 911 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 912 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 913 { ISD::FSUB, 
MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 914 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 915 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 916 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 917 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ 918 919 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ 920 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ 921 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/ 922 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/ 923 }; 924 925 if (ST->hasAVX512()) 926 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) 927 if (auto KindCost = Entry->Cost[CostKind]) 928 return LT.first * *KindCost; 929 930 static const CostKindTblEntry AVX2ShiftCostTable[] = { 931 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as 932 // custom in order to detect the cases where the shift amount is a scalar. 933 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org) 934 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org) 935 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org) 936 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org) 937 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org) 938 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org) 939 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org) 940 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org) 941 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org) 942 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org) 943 }; 944 945 if (ST->hasAVX512()) { 946 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) 947 // On AVX512, a packed v32i16 shift left by a constant build_vector 948 // is lowered into a vector multiply (vpmullw). 949 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, 950 Op1Info.getNoProps(), Op2Info.getNoProps()); 951 } 952 953 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). 954 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { 955 if (ISD == ISD::SHL && LT.second == MVT::v16i16 && 956 Op2Info.isConstant()) 957 // On AVX2, a packed v16i16 shift left by a constant build_vector 958 // is lowered into a vector multiply (vpmullw). 959 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, 960 Op1Info.getNoProps(), Op2Info.getNoProps()); 961 962 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) 963 if (auto KindCost = Entry->Cost[CostKind]) 964 return LT.first * *KindCost; 965 } 966 967 static const CostKindTblEntry XOPShiftCostTable[] = { 968 // 128bit shifts take 1cy, but right shifts require negation beforehand.
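// (XOP vpshl*/vpsha* shift left by a signed per-element amount; a right shift
// is performed by negating the amount first, which is why the right-shift
// entries below are slightly more expensive. A constant amount lets the
// negation fold away, as handled after this table.)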
969 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } }, 970 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } }, 971 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } }, 972 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } }, 973 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } }, 974 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } }, 975 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } }, 976 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } }, 977 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } }, 978 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } }, 979 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, 980 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } }, 981 // 256bit shifts require splitting if AVX2 didn't catch them above. 982 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } }, 983 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } }, 984 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } }, 985 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } }, 986 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } }, 987 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } }, 988 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } }, 989 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } }, 990 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } }, 991 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } }, 992 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } }, 993 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } }, 994 }; 995 996 // Look for XOP lowering tricks. 997 if (ST->hasXOP()) { 998 // If the right shift is constant then we'll fold the negation so 999 // it's as cheap as a left shift. 1000 int ShiftISD = ISD; 1001 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) 1002 ShiftISD = ISD::SHL; 1003 if (const auto *Entry = 1004 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) 1005 if (auto KindCost = Entry->Cost[CostKind]) 1006 return LT.first * *KindCost; 1007 } 1008 1009 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { 1010 MVT VT = LT.second; 1011 // Vector shift left by non uniform constant can be lowered 1012 // into vector multiply. 
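// (x << c == x * (1 << c), so the per-lane constant shift amounts become a
// constant multiplier vector and the cost is that of a vector multiply.)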
1013 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || 1014 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) 1015 ISD = ISD::MUL; 1016 } 1017 1018 static const CostKindTblEntry GLMCostTable[] = { 1019 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss 1020 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps 1021 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd 1022 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd 1023 }; 1024 1025 if (ST->useGLMDivSqrtCosts()) 1026 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) 1027 if (auto KindCost = Entry->Cost[CostKind]) 1028 return LT.first * *KindCost; 1029 1030 static const CostKindTblEntry SLMCostTable[] = { 1031 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld 1032 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw 1033 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd 1034 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss 1035 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd 1036 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps 1037 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss 1038 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps 1039 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd 1040 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd 1041 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd 1042 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd 1043 // v2i64/v4i64 mul is custom lowered as a series of long 1044 // multiplies(3), shifts(3) and adds(2); 1045 // slm muldq throughput is 2 and addq throughput is 4, 1046 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + 1047 // 2X4 (addq throughput) = 17 1048 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } }, 1049 // slm addq/subq throughput is 4 1050 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } }, 1051 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } }, 1052 }; 1053 1054 if (ST->useSLMArithCosts()) 1055 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) 1056 if (auto KindCost = Entry->Cost[CostKind]) 1057 return LT.first * *KindCost; 1058 1059 static const CostKindTblEntry AVX2CostTable[] = { 1060 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence. 1061 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence. 1062 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence. 1063 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. 1064 1065 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence. 1066 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence. 1067 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence. 1068 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. 1069 1070 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence. 1071 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence. 1072 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence. 1073 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence. 1074 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence. 1075 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
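// ("srl/xor/sub" is the usual emulation of a vector arithmetic shift when
// vpsraq is unavailable: ashr(x, n) == sub(xor(lshr(x, n), m), m) with
// m = signbit >> n.)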
1076 1077 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb 1078 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb 1079 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw 1080 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw 1081 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd 1082 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd 1083 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq 1084 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq 1085 1086 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 1 } }, // pmullw 1087 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld 1088 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld 1089 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add 1090 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add 1091 1092 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd 1093 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps 1094 1095 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd 1096 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss 1097 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd 1098 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps 1099 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd 1100 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps 1101 1102 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd 1103 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss 1104 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd 1105 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps 1106 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd 1107 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps 1108 1109 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd 1110 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss 1111 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd 1112 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps 1113 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd 1114 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps 1115 1116 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss 1117 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps 1118 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps 1119 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd 1120 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd 1121 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd 1122 }; 1123 1124 // Look for AVX2 lowering tricks for custom cases. 1125 if (ST->hasAVX2()) 1126 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) 1127 if (auto KindCost = Entry->Cost[CostKind]) 1128 return LT.first * *KindCost; 1129 1130 static const CostKindTblEntry AVX1CostTable[] = { 1131 // We don't have to scalarize unsupported ops. We can issue two half-sized 1132 // operations and we only need to extract the upper YMM half. 1133 // Two ops + 1 extract + 1 insert = 4. 
1134 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split 1135 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split 1136 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld 1137 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, 1138 1139 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps 1140 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps 1141 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps 1142 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps 1143 1144 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps 1145 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps 1146 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps 1147 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps 1148 1149 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps 1150 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps 1151 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps 1152 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps 1153 1154 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split 1155 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split 1156 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split 1157 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split 1158 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split 1159 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split 1160 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split 1161 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split 1162 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq 1163 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq 1164 1165 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence. 1166 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split. 1167 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence. 1168 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split. 1169 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld 1170 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split 1171 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. 1172 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. 1173 1174 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence. 1175 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split. 1176 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. 1177 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. 1178 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. 1179 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. 1180 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. 1181 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. 1182 1183 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence. 1184 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split. 1185 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. 1186 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. 1187 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. 1188 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. 1189 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend. 1190 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. 
1191 1192 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1193 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1194 1195 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1196 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1197 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1198 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1199 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1200 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1201 1202 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1203 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1204 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1205 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ 1206 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1207 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ 1208 1209 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1210 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1211 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1212 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ 1213 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1214 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ 1215 1216 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ 1217 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ 1218 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ 1219 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ 1220 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ 1221 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ 1222 }; 1223 1224 if (ST->hasAVX()) 1225 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) 1226 if (auto KindCost = Entry->Cost[CostKind]) 1227 return LT.first * *KindCost; 1228 1229 static const CostKindTblEntry SSE42CostTable[] = { 1230 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1231 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1232 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1233 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1234 1235 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1236 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1237 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1238 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ 1239 1240 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1241 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1242 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1243 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ 1244 1245 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, 
// Nehalem from http://www.agner.org/ 1246 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ 1247 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ 1248 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ 1249 1250 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add 1251 }; 1252 1253 if (ST->hasSSE42()) 1254 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) 1255 if (auto KindCost = Entry->Cost[CostKind]) 1256 return LT.first * *KindCost; 1257 1258 static const CostKindTblEntry SSE41CostTable[] = { 1259 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence. 1260 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence. 1261 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld 1262 1263 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence. 1264 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. 1265 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. 1266 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. 1267 1268 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence. 1269 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. 1270 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. 1271 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. 1272 1273 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) 1274 }; 1275 1276 if (ST->hasSSE41()) 1277 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) 1278 if (auto KindCost = Entry->Cost[CostKind]) 1279 return LT.first * *KindCost; 1280 1281 static const CostKindTblEntry SSE2CostTable[] = { 1282 // We don't correctly identify costs of casts because they are marked as 1283 // custom. 1284 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence. 1285 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence. 1286 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq. 1287 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. 1288 1289 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence. 1290 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. 1291 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. 1292 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. 1293 1294 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence. 1295 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. 1296 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. 1297 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence. 
1298 1299 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand 1300 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand 1301 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand 1302 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand 1303 1304 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por 1305 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por 1306 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por 1307 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por 1308 1309 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor 1310 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor 1311 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor 1312 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor 1313 1314 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq 1315 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq 1316 1317 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw 1318 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle 1319 { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add 1320 1321 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1322 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1323 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1324 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1325 1326 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1327 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1328 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1329 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1330 1331 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1332 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1333 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1334 1335 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1336 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1337 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1338 1339 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1340 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ 1341 }; 1342 1343 if (ST->hasSSE2()) 1344 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) 1345 if (auto KindCost = Entry->Cost[CostKind]) 1346 return LT.first * *KindCost; 1347 1348 static const CostKindTblEntry SSE1CostTable[] = { 1349 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ 1350 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ 1351 1352 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ 1353 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ 1354 1355 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1356 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1357 1358 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1359 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ 1360 1361 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ 1362 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } 
}, // Pentium III from http://www.agner.org/ 1363 }; 1364 1365 if (ST->hasSSE1()) 1366 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) 1367 if (auto KindCost = Entry->Cost[CostKind]) 1368 return LT.first * *KindCost; 1369 1370 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets 1371 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ 1372 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ 1373 { ISD::MUL, MVT::i64, { 2 } }, // Nehalem from http://www.agner.org/ 1374 }; 1375 1376 if (ST->is64Bit()) 1377 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) 1378 if (auto KindCost = Entry->Cost[CostKind]) 1379 return LT.first * *KindCost; 1380 1381 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets 1382 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ 1383 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ 1384 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ 1385 1386 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ 1387 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ 1388 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ 1389 1390 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87) 1391 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87) 1392 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87) 1393 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87) 1394 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87) 1395 }; 1396 1397 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) 1398 if (auto KindCost = Entry->Cost[CostKind]) 1399 return LT.first * *KindCost; 1400 1401 // It is not a good idea to vectorize division. We have to scalarize it and 1402 // in the process we will often end up having to spill regular 1403 // registers. The overhead of division is going to dominate most kernels 1404 // anyways so try hard to prevent vectorization of division - it is 1405 // generally a bad idea. Assume somewhat arbitrarily that we have to be able 1406 // to hide "20 cycles" for each lane. 1407 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && 1408 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || 1409 ISD == ISD::UREM)) { 1410 InstructionCost ScalarCost = 1411 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, 1412 Op1Info.getNoProps(), Op2Info.getNoProps()); 1413 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; 1414 } 1415 1416 // Handle some basic single instruction code size cases. 1417 if (CostKind == TTI::TCK_CodeSize) { 1418 switch (ISD) { 1419 case ISD::FADD: 1420 case ISD::FSUB: 1421 case ISD::FMUL: 1422 case ISD::FDIV: 1423 case ISD::FNEG: 1424 case ISD::AND: 1425 case ISD::OR: 1426 case ISD::XOR: 1427 return LT.first; 1428 break; 1429 } 1430 } 1431 1432 // Fall back to the default implementation. 1433 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 1434 Args, CxtI); 1435 } 1436 1437 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 1438 VectorType *BaseTp, 1439 ArrayRef<int> Mask, 1440 TTI::TargetCostKind CostKind, 1441 int Index, VectorType *SubTp, 1442 ArrayRef<const Value *> Args) { 1443 // 64-bit packed float vectors (v2f32) are widened to type v4f32. 1444 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
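// Illustrative sketch only (not exercised in this file): a client pass
// typically reaches this hook through the TargetTransformInfo wrapper,
// roughly as
//   InstructionCost C =
//       TTI.getShuffleCost(TTI::SK_Broadcast, V2F32Ty, std::nullopt,
//                          TTI::TCK_RecipThroughput, 0, nullptr);
// where TTI and V2F32Ty (a <2 x float> vector type) are assumed names. After
// the widening described above, such a query is costed against the MVT::v4f32
// table entries further down.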
1445 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp); 1446 1447 Kind = improveShuffleKindFromMask(Kind, Mask); 1448 1449 // Treat Transpose as 2-op shuffles - there's no difference in lowering. 1450 if (Kind == TTI::SK_Transpose) 1451 Kind = TTI::SK_PermuteTwoSrc; 1452 1453 // For Broadcasts we are splatting the first element from the first input 1454 // register, so only need to reference that input and all the output 1455 // registers are the same. 1456 if (Kind == TTI::SK_Broadcast) 1457 LT.first = 1; 1458 1459 // Subvector extractions are free if they start at the beginning of a 1460 // vector and cheap if the subvectors are aligned. 1461 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { 1462 int NumElts = LT.second.getVectorNumElements(); 1463 if ((Index % NumElts) == 0) 1464 return 0; 1465 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 1466 if (SubLT.second.isVector()) { 1467 int NumSubElts = SubLT.second.getVectorNumElements(); 1468 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 1469 return SubLT.first; 1470 // Handle some cases for widening legalization. For now we only handle 1471 // cases where the original subvector was naturally aligned and evenly 1472 // fit in its legalized subvector type. 1473 // FIXME: Remove some of the alignment restrictions. 1474 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit 1475 // vectors. 1476 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); 1477 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && 1478 (NumSubElts % OrigSubElts) == 0 && 1479 LT.second.getVectorElementType() == 1480 SubLT.second.getVectorElementType() && 1481 LT.second.getVectorElementType().getSizeInBits() == 1482 BaseTp->getElementType()->getPrimitiveSizeInBits()) { 1483 assert(NumElts >= NumSubElts && NumElts > OrigSubElts && 1484 "Unexpected number of elements!"); 1485 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), 1486 LT.second.getVectorNumElements()); 1487 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), 1488 SubLT.second.getVectorNumElements()); 1489 int ExtractIndex = alignDown((Index % NumElts), NumSubElts); 1490 InstructionCost ExtractCost = 1491 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt, 1492 CostKind, ExtractIndex, SubTy); 1493 1494 // If the original size is 32-bits or more, we can use pshufd. Otherwise 1495 // if we have SSSE3 we can use pshufb. 1496 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) 1497 return ExtractCost + 1; // pshufd or pshufb 1498 1499 assert(SubTp->getPrimitiveSizeInBits() == 16 && 1500 "Unexpected vector size"); 1501 1502 return ExtractCost + 2; // worst case pshufhw + pshufd 1503 } 1504 } 1505 } 1506 1507 // Subvector insertions are cheap if the subvectors are aligned. 1508 // Note that in general, the insertion starting at the beginning of a vector 1509 // isn't free, because we need to preserve the rest of the wide vector. 1510 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { 1511 int NumElts = LT.second.getVectorNumElements(); 1512 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 1513 if (SubLT.second.isVector()) { 1514 int NumSubElts = SubLT.second.getVectorNumElements(); 1515 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 1516 return SubLT.first; 1517 } 1518 1519 // If the insertion isn't aligned, treat it like a 2-op shuffle. 
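// For example (illustrative): inserting a <4 x float> subvector into an
// <8 x float> vector at index 2 is not register-aligned
// ((Index % NumSubElts) != 0), so it is costed below as a generic
// two-source permute.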
1520 Kind = TTI::SK_PermuteTwoSrc; 1521 } 1522 1523 // Handle some common (illegal) sub-vector types as they are often very cheap 1524 // to shuffle even on targets without PSHUFB. 1525 EVT VT = TLI->getValueType(DL, BaseTp); 1526 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && 1527 !ST->hasSSSE3()) { 1528 static const CostTblEntry SSE2SubVectorShuffleTbl[] = { 1529 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw 1530 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw 1531 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw 1532 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw 1533 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck 1534 1535 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw 1536 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw 1537 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus 1538 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck 1539 1540 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq 1541 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq 1542 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq 1543 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq 1544 1545 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw 1546 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw 1547 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw 1548 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw 1549 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck 1550 1551 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw 1552 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw 1553 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw 1554 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw 1555 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck 1556 }; 1557 1558 if (ST->hasSSE2()) 1559 if (const auto *Entry = 1560 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) 1561 return Entry->Cost; 1562 } 1563 1564 // We are going to permute multiple sources and the result will be in multiple 1565 // destinations. Providing an accurate cost only for splits where the element 1566 // type remains the same. 1567 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { 1568 MVT LegalVT = LT.second; 1569 if (LegalVT.isVector() && 1570 LegalVT.getVectorElementType().getSizeInBits() == 1571 BaseTp->getElementType()->getPrimitiveSizeInBits() && 1572 LegalVT.getVectorNumElements() < 1573 cast<FixedVectorType>(BaseTp)->getNumElements()) { 1574 1575 unsigned VecTySize = DL.getTypeStoreSize(BaseTp); 1576 unsigned LegalVTSize = LegalVT.getStoreSize(); 1577 // Number of source vectors after legalization: 1578 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; 1579 // Number of destination vectors after legalization: 1580 InstructionCost NumOfDests = LT.first; 1581 1582 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), 1583 LegalVT.getVectorNumElements()); 1584 1585 if (!Mask.empty() && NumOfDests.isValid()) { 1586 // Try to perform better estimation of the permutation. 1587 // 1. Split the source/destination vectors into real registers. 1588 // 2. Do the mask analysis to identify which real registers are 1589 // permuted. If more than 1 source registers are used for the 1590 // destination register building, the cost for this destination register 1591 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one 1592 // source register is used, build mask and calculate the cost as a cost 1593 // of PermuteSingleSrc. 
1594 // Also, for the single register permute we try to identify if the 1595 // destination register is just a copy of the source register or the 1596 // copy of the previous destination register (the cost is 1597 // TTI::TCC_Basic). If the source register is just reused, the cost for 1598 // this operation is 0. 1599 unsigned E = *NumOfDests.getValue(); 1600 unsigned NormalizedVF = 1601 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); 1602 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); 1603 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); 1604 SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem); 1605 copy(Mask, NormalizedMask.begin()); 1606 unsigned PrevSrcReg = 0; 1607 ArrayRef<int> PrevRegMask; 1608 InstructionCost Cost = 0; 1609 processShuffleMasks( 1610 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, 1611 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, 1612 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { 1613 if (!ShuffleVectorInst::isIdentityMask(RegMask)) { 1614 // Check if the previous register can be just copied to the next 1615 // one. 1616 if (PrevRegMask.empty() || PrevSrcReg != SrcReg || 1617 PrevRegMask != RegMask) 1618 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, 1619 RegMask, CostKind, 0, nullptr); 1620 else 1621 // Just a copy of previous destination register. 1622 Cost += TTI::TCC_Basic; 1623 return; 1624 } 1625 if (SrcReg != DestReg && 1626 any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { 1627 // Just a copy of the source register. 1628 Cost += TTI::TCC_Basic; 1629 } 1630 PrevSrcReg = SrcReg; 1631 PrevRegMask = RegMask; 1632 }, 1633 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, 1634 unsigned /*Unused*/, 1635 unsigned /*Unused*/) { 1636 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, 1637 CostKind, 0, nullptr); 1638 }); 1639 return Cost; 1640 } 1641 1642 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; 1643 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 1644 std::nullopt, CostKind, 0, nullptr); 1645 } 1646 1647 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); 1648 } 1649 1650 // For 2-input shuffles, we must account for splitting the 2 inputs into many. 1651 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { 1652 // We assume that source and destination have the same vector type. 
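// Worked example (illustrative): a two-source permute of <16 x i32> on an
// SSE2-only subtarget legalizes to four v4i32 registers (LT.first == 4), so
// the accounting below charges 4 destinations * (2 * 4 - 1) = 28 shuffles of
// the legal type before the per-shuffle table cost is applied.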
1653 InstructionCost NumOfDests = LT.first; 1654 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; 1655 LT.first = NumOfDests * NumOfShufflesPerDest; 1656 } 1657 1658 static const CostTblEntry AVX512VBMIShuffleTbl[] = { 1659 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb 1660 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb 1661 1662 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb 1663 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb 1664 1665 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b 1666 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b 1667 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b 1668 }; 1669 1670 if (ST->hasVBMI()) 1671 if (const auto *Entry = 1672 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) 1673 return LT.first * Entry->Cost; 1674 1675 static const CostTblEntry AVX512BWShuffleTbl[] = { 1676 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw 1677 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw 1678 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb 1679 1680 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw 1681 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw 1682 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw 1683 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 1684 1685 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw 1686 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw 1687 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw 1688 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw 1689 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 1690 1691 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w 1692 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w 1693 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w 1694 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w 1695 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 1696 1697 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw 1698 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb 1699 1700 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr 1701 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr 1702 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr 1703 }; 1704 1705 if (ST->hasBWI()) 1706 if (const auto *Entry = 1707 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) 1708 return LT.first * Entry->Cost; 1709 1710 static const CostKindTblEntry AVX512ShuffleTbl[] = { 1711 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd 1712 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss 1713 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq 1714 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd 1715 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw 1716 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw 1717 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb 1718 1719 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd 1720 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps 1721 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq 1722 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd 1723 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca 1724 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca 1725 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca 1726 1727 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd 1728 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd 1729 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd 1730 {TTI::SK_Splice, MVT::v8f32, { 
1, 1, 1, 1 } }, // vpalignd 1731 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd 1732 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd 1733 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd 1734 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd 1735 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr 1736 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr 1737 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr 1738 1739 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd 1740 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd 1741 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd 1742 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps 1743 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps 1744 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps 1745 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq 1746 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq 1747 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq 1748 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd 1749 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd 1750 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd 1751 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb 1752 1753 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd 1754 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps 1755 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q 1756 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d 1757 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd 1758 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps 1759 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q 1760 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d 1761 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd 1762 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps 1763 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q 1764 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d 1765 1766 // FIXME: This just applies the type legalization cost rules above 1767 // assuming these completely split. 
1768 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } }, 1769 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } }, 1770 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } }, 1771 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } }, 1772 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } }, 1773 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } }, 1774 1775 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq 1776 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq 1777 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq 1778 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd 1779 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps 1780 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq 1781 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd 1782 }; 1783 1784 if (ST->hasAVX512()) 1785 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) 1786 if (auto KindCost = Entry->Cost[CostKind]) 1787 return LT.first * *KindCost; 1788 1789 static const CostTblEntry AVX2ShuffleTbl[] = { 1790 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd 1791 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps 1792 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq 1793 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd 1794 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw 1795 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw 1796 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb 1797 1798 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd 1799 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps 1800 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq 1801 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd 1802 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb 1803 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb 1804 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb 1805 1806 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb 1807 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb 1808 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb 1809 1810 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr 1811 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr 1812 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr 1813 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr 1814 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr 1815 1816 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd 1817 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps 1818 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq 1819 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd 1820 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb 1821 // + vpblendvb 1822 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb 1823 // + vpblendvb 1824 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb 1825 // + vpblendvb 1826 1827 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd 1828 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps 1829 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd 1830 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd 1831 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb 1832 // + vpblendvb 1833 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb 1834 // + vpblendvb 1835 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb 1836 // + vpblendvb 1837 }; 1838 1839 if (ST->hasAVX2()) 1840 if (const auto *Entry = 
CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) 1841 return LT.first * Entry->Cost; 1842 1843 static const CostTblEntry XOPShuffleTbl[] = { 1844 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd 1845 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps 1846 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd 1847 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps 1848 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm 1849 // + vinsertf128 1850 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm 1851 // + vinsertf128 1852 1853 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm 1854 // + vinsertf128 1855 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm 1856 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm 1857 // + vinsertf128 1858 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm 1859 }; 1860 1861 if (ST->hasXOP()) 1862 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) 1863 return LT.first * Entry->Cost; 1864 1865 static const CostTblEntry AVX1ShuffleTbl[] = { 1866 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd 1867 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps 1868 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd 1869 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps 1870 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 1871 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 1872 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 1873 1874 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd 1875 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps 1876 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd 1877 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps 1878 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb 1879 // + vinsertf128 1880 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb 1881 // + vinsertf128 1882 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb 1883 // + vinsertf128 1884 1885 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd 1886 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd 1887 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps 1888 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps 1889 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor 1890 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor 1891 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor 1892 1893 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd 1894 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd 1895 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 1896 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 1897 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 1898 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 1899 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 1900 1901 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd 1902 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd 1903 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 1904 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 1905 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb 1906 // + 2*por + vinsertf128 1907 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 
4*pshufb 1908 // + 2*por + vinsertf128 1909 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb 1910 // + 2*por + vinsertf128 1911 1912 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd 1913 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd 1914 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 1915 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 1916 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb 1917 // + 4*por + vinsertf128 1918 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb 1919 // + 4*por + vinsertf128 1920 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb 1921 // + 4*por + vinsertf128 1922 }; 1923 1924 if (ST->hasAVX()) 1925 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) 1926 return LT.first * Entry->Cost; 1927 1928 static const CostTblEntry SSE41ShuffleTbl[] = { 1929 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw 1930 {TTI::SK_Select, MVT::v2f64, 1}, // movsd 1931 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw 1932 {TTI::SK_Select, MVT::v4f32, 1}, // blendps 1933 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw 1934 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw 1935 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb 1936 }; 1937 1938 if (ST->hasSSE41()) 1939 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) 1940 return LT.first * Entry->Cost; 1941 1942 static const CostTblEntry SSSE3ShuffleTbl[] = { 1943 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb 1944 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb 1945 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb 1946 1947 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb 1948 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb 1949 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb 1950 1951 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por 1952 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por 1953 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por 1954 1955 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr 1956 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr 1957 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr 1958 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr 1959 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr 1960 1961 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb 1962 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb 1963 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb 1964 1965 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por 1966 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por 1967 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por 1968 }; 1969 1970 if (ST->hasSSSE3()) 1971 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) 1972 return LT.first * Entry->Cost; 1973 1974 static const CostTblEntry SSE2ShuffleTbl[] = { 1975 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd 1976 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd 1977 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd 1978 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd 1979 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd 1980 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd 1981 1982 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd 1983 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd 1984 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd 1985 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd 1986 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd 1987 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw 1988 // + 2*pshufd + 2*unpck 
+ packus 1989 1990 {TTI::SK_Select, MVT::v2i64, 1}, // movsd 1991 {TTI::SK_Select, MVT::v2f64, 1}, // movsd 1992 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps 1993 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por 1994 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por 1995 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por 1996 1997 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd 1998 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd 1999 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} 2000 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por 2001 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por 2002 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por 2003 2004 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd 2005 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd 2006 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd 2007 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw 2008 // + pshufd/unpck 2009 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw 2010 // + pshufd/unpck 2011 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw 2012 // + 2*pshufd + 2*unpck + 2*packus 2013 2014 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd 2015 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd 2016 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} 2017 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute 2018 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute 2019 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute 2020 }; 2021 2022 static const CostTblEntry SSE3BroadcastLoadTbl[] = { 2023 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup 2024 }; 2025 2026 if (ST->hasSSE2()) { 2027 bool IsLoad = 2028 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); 2029 if (ST->hasSSE3() && IsLoad) 2030 if (const auto *Entry = 2031 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { 2032 assert(isLegalBroadcastLoad(BaseTp->getElementType(), 2033 LT.second.getVectorElementCount()) && 2034 "Table entry missing from isLegalBroadcastLoad()"); 2035 return LT.first * Entry->Cost; 2036 } 2037 2038 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) 2039 return LT.first * Entry->Cost; 2040 } 2041 2042 static const CostTblEntry SSE1ShuffleTbl[] = { 2043 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps 2044 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps 2045 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps 2046 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps 2047 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps 2048 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps 2049 }; 2050 2051 if (ST->hasSSE1()) 2052 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) 2053 return LT.first * Entry->Cost; 2054 2055 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); 2056 } 2057 2058 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 2059 Type *Src, 2060 TTI::CastContextHint CCH, 2061 TTI::TargetCostKind CostKind, 2062 const Instruction *I) { 2063 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2064 assert(ISD && "Invalid opcode"); 2065 2066 // TODO: Allow non-throughput costs that aren't binary. 2067 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 2068 if (CostKind != TTI::TCK_RecipThroughput) 2069 return Cost == 0 ? 
0 : 1; 2070 return Cost; 2071 }; 2072 2073 // The cost tables include both specific, custom (non-legal) src/dst type 2074 // conversions and generic, legalized types. We test for customs first, before 2075 // falling back to legalization. 2076 // FIXME: Need a better design of the cost table to handle non-simple types of 2077 // potential massive combinations (elem_num x src_type x dst_type). 2078 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { 2079 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, 2080 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, 2081 2082 // Mask sign extend has an instruction. 2083 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, 2084 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, 2085 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, 2086 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, 2087 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, 2088 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, 2089 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, 2090 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, 2091 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, 2092 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, 2093 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, 2094 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, 2095 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 2096 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, 2097 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, 2098 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, 2099 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, 2100 2101 // Mask zero extend is a sext + shift. 2102 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, 2103 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, 2104 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, 2105 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, 2106 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, 2107 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, 2108 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, 2109 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, 2110 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, 2111 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, 2112 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, 2113 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, 2114 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, 2115 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, 2116 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, 2117 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, 2118 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, 2119 2120 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, 2121 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, 2122 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, 2123 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, 2124 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, 2125 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, 2126 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, 2127 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, 2128 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, 2129 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, 2130 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, 2131 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, 2132 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, 2133 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, 2134 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, 2135 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, 2136 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, 2137 2138 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, 2139 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm 2140 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb 2141 { 
ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb 2142 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb 2143 }; 2144 2145 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { 2146 // Mask sign extend has an instruction. 2147 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, 2148 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, 2149 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, 2150 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, 2151 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, 2152 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, 2153 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, 2154 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, 2155 2156 // Mask zero extend is a sext + shift. 2157 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, 2158 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, 2159 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, 2160 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, 2161 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, 2162 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, 2163 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, 2164 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 2165 2166 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, 2167 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, 2168 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, 2169 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, 2170 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, 2171 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, 2172 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, 2173 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, 2174 2175 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 2176 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 2177 2178 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 2179 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 2180 2181 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, 2182 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, 2183 2184 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, 2185 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, 2186 }; 2187 2188 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and 2189 // 256-bit wide vectors. 
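// Illustrative sketch (assumed example): with AVX512DQ a conversion such as
//   %d = sitofp <8 x i64> %v to <8 x double>
// matches the { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 } entry above,
// i.e. a single vcvtqq2pd; subtargets without AVX512DQ fall through to the
// later tables and generic handling, which model a longer sequence.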
2190 2191 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { 2192 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, 2193 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, 2194 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, 2195 2196 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd 2197 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd 2198 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd 2199 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd 2200 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq 2201 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq 2202 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq 2203 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd 2204 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd 2205 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd 2206 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd 2207 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd 2208 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq 2209 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq 2210 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq 2211 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb 2212 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb 2213 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb 2214 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb 2215 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb 2216 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw 2217 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw 2218 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb 2219 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb 2220 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb 2221 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb 2222 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb 2223 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb 2224 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw 2225 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw 2226 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw 2227 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd 2228 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd 2229 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb 2230 2231 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 2232 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, 2233 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 }, 2234 2235 // Sign extend is zmm vpternlogd+vptruncdb. 2236 // Zero extend is zmm broadcast load+vptruncdw. 2237 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, 2238 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, 2239 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, 2240 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, 2241 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, 2242 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, 2243 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, 2244 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, 2245 2246 // Sign extend is zmm vpternlogd+vptruncdw. 2247 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. 
2248 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, 2249 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, 2250 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, 2251 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, 2252 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, 2253 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, 2254 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, 2255 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, 2256 2257 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd 2258 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld 2259 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd 2260 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld 2261 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd 2262 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld 2263 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq 2264 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq 2265 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq 2266 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq 2267 2268 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd 2269 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld 2270 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq 2271 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq 2272 2273 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 2274 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 2275 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 2276 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 2277 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, 2278 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, 2279 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 2280 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 2281 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 2282 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 2283 2284 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right 2285 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right 2286 2287 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 2288 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 2289 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, 2290 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, 2291 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 2292 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, 2293 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 2294 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 2295 2296 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 2297 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 2298 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, 2299 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, 2300 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 2301 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, 2302 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 2303 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 2304 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, 2305 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, 2306 2307 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, 2308 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, 2309 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 }, 2310 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 }, 2311 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 }, 2312 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, 2313 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, 2314 { 
ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, 2315 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 }, 2316 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, 2317 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, 2318 2319 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, 2320 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, 2321 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, 2322 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, 2323 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, 2324 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, 2325 }; 2326 2327 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { 2328 // Mask sign extend has an instruction. 2329 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, 2330 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, 2331 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, 2332 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, 2333 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, 2334 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, 2335 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, 2336 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, 2337 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, 2338 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, 2339 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, 2340 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, 2341 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 2342 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, 2343 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, 2344 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, 2345 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, 2346 2347 // Mask zero extend is a sext + shift. 2348 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, 2349 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, 2350 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, 2351 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, 2352 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, 2353 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, 2354 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, 2355 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, 2356 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, 2357 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, 2358 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, 2359 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, 2360 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, 2361 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, 2362 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, 2363 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, 2364 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, 2365 2366 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, 2367 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, 2368 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, 2369 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, 2370 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, 2371 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, 2372 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, 2373 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, 2374 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, 2375 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, 2376 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, 2377 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, 2378 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, 2379 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, 2380 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, 2381 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, 2382 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, 2383 2384 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, 2385 }; 2386 2387 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { 2388 // Mask 
sign extend has an instruction. 2389 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, 2390 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, 2391 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, 2392 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, 2393 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, 2394 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, 2395 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, 2396 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, 2397 2398 // Mask zero extend is a sext + shift. 2399 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, 2400 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, 2401 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, 2402 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, 2403 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, 2404 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, 2405 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, 2406 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, 2407 2408 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, 2409 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, 2410 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, 2411 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, 2412 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, 2413 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, 2414 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, 2415 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, 2416 2417 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 2418 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 2419 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 2420 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 2421 2422 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 2423 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 2424 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 2425 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 2426 2427 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 }, 2428 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, 2429 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 2430 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, 2431 2432 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 }, 2433 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, 2434 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 2435 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, 2436 }; 2437 2438 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { 2439 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd 2440 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd 2441 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd 2442 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 2443 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq 2444 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq 2445 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq 2446 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 2447 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd 2448 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd 2449 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd 2450 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd 2451 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq 2452 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq 2453 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd 2454 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb 2455 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw 2456 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb 2457 2458 // sign 
extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb 2459 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb 2460 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, 2461 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, 2462 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, 2463 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, 2464 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, 2465 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, 2466 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, 2467 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, 2468 2469 // sign extend is vpcmpeq+maskedmove+vpmovdw 2470 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw 2471 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, 2472 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, 2473 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, 2474 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, 2475 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, 2476 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, 2477 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, 2478 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, 2479 2480 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd 2481 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld 2482 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd 2483 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld 2484 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd 2485 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld 2486 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd 2487 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld 2488 2489 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq 2490 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq 2491 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq 2492 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq 2493 2494 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, 2495 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, 2496 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, 2497 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, 2498 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 2499 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 2500 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, 2501 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, 2502 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 2503 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 2504 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 2505 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 2506 2507 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, 2508 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, 2509 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, 2510 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, 2511 2512 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, 2513 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, 2514 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, 2515 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, 2516 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, 2517 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, 2518 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 2519 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 2520 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 2521 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 2522 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, 2523 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 2524 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, 2525 2526 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, 2527 { 
ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, 2528 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 }, 2529 2530 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, 2531 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, 2532 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 2533 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 }, 2534 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, 2535 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, 2536 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, 2537 }; 2538 2539 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { 2540 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 2541 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 2542 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 2543 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 2544 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 2545 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 2546 2547 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, 2548 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, 2549 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, 2550 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, 2551 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 2552 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 2553 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, 2554 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, 2555 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 2556 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 2557 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, 2558 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, 2559 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 2560 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 2561 2562 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, 2563 2564 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 }, 2565 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 }, 2566 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 }, 2567 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 }, 2568 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 }, 2569 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 }, 2570 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 }, 2571 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 }, 2572 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 }, 2573 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 }, 2574 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, 2575 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, 2576 2577 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, 2578 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, 2579 2580 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 }, 2581 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 }, 2582 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 }, 2583 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 }, 2584 2585 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 }, 2586 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 }, 2587 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 }, 2588 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, 2589 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, 2590 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 }, 2591 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 }, 2592 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 }, 2593 2594 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, 2595 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, 2596 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, 2597 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 2598 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 2599 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 2600 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 }, 2601 2602 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, 2603 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, 2604 
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, 2605 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 2606 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, 2607 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, 2608 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 }, 2609 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, 2610 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 2611 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, 2612 }; 2613 2614 static const TypeConversionCostTblEntry AVXConversionTbl[] = { 2615 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, 2616 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, 2617 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, 2618 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, 2619 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, 2620 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, 2621 2622 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, 2623 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, 2624 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, 2625 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, 2626 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, 2627 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, 2628 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, 2629 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, 2630 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, 2631 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, 2632 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, 2633 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, 2634 2635 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, 2636 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, 2637 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, 2638 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, 2639 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, 2640 2641 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, 2642 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 2643 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb 2644 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 }, 2645 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 2646 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 }, 2647 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw 2648 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, 2649 2650 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 2651 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, 2652 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, 2653 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, 2654 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, 2655 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 2656 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, 2657 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, 2658 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 2659 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, 2660 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 }, 2661 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 }, 2662 2663 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, 2664 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, 2665 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, 2666 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, 2667 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, 2668 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 2669 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, 2670 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 }, 2671 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 }, 2672 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, 2673 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, 2674 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, 2675 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 }, 2676 { 
ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 }, 2677 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 }, 2678 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 2679 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 }, 2680 2681 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, 2682 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 }, 2683 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 }, 2684 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 }, 2685 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 }, 2686 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 }, 2687 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 }, 2688 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 }, 2689 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 }, 2690 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 }, 2691 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 }, 2692 2693 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 }, 2694 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 }, 2695 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 }, 2696 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 }, 2697 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 }, 2698 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 }, 2699 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 }, 2700 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 }, 2701 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, 2702 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, 2703 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 }, 2704 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 }, 2705 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, 2706 2707 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, 2708 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, 2709 }; 2710 2711 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { 2712 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, 2713 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, 2714 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, 2715 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, 2716 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, 2717 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, 2718 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, 2719 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, 2720 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, 2721 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, 2722 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, 2723 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, 2724 2725 // These truncates end up widening elements. 
2726 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
2727 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
2728 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
2729
2730 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2731 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2732 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2733
2734 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2735 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2736 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2737 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2738 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2739 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2740 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2741 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2742 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2743 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2744 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2745
2746 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2747 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2748 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2749 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2750 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2751 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2752 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2753 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2754 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2755 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2756 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2757 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2758 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2759 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2760
2761 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2762 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2763 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2764 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2765 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2766 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2767 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2768 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2769 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2770 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2771
2772 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2773 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2774 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2775 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2776 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2777 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2778 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2779 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2781 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2782 };
2783
2784 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2785 // These are somewhat magic numbers justified by comparing the
2786 // output of llvm-mca for our various supported scheduler models
2787 // and basing them off the worst case scenario.
2788 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, 2789 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, 2790 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, 2791 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, 2792 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, 2793 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, 2794 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, 2795 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, 2796 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, 2797 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, 2798 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, 2799 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, 2800 2801 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, 2802 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, 2803 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, 2804 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, 2805 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, 2806 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, 2807 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, 2808 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, 2809 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, 2810 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, 2811 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, 2812 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, 2813 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, 2814 2815 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, 2816 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, 2817 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, 2818 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, 2819 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, 2820 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, 2821 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, 2822 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, 2823 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, 2824 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, 2825 2826 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, 2827 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, 2828 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, 2829 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, 2830 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, 2831 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, 2832 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, 2833 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, 2834 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, 2835 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, 2836 2837 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, 2838 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, 2839 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, 2840 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, 2841 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, 2842 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, 2843 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, 2844 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, 2845 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, 2846 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, 2847 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, 2848 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, 2849 2850 // These truncates are really widening elements. 
2851 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD 2852 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ 2853 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD 2854 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD 2855 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD 2856 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW 2857 2858 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB 2859 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, 2860 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB 2861 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, 2862 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, 2863 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, 2864 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 2865 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, 2866 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB 2867 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW 2868 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD 2869 }; 2870 2871 // Attempt to map directly to (simple) MVT types to let us match custom entries. 2872 EVT SrcTy = TLI->getValueType(DL, Src); 2873 EVT DstTy = TLI->getValueType(DL, Dst); 2874 2875 // The function getSimpleVT only handles simple value types. 2876 if (SrcTy.isSimple() && DstTy.isSimple()) { 2877 MVT SimpleSrcTy = SrcTy.getSimpleVT(); 2878 MVT SimpleDstTy = DstTy.getSimpleVT(); 2879 2880 if (ST->useAVX512Regs()) { 2881 if (ST->hasBWI()) 2882 if (const auto *Entry = ConvertCostTableLookup( 2883 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2884 return AdjustCost(Entry->Cost); 2885 2886 if (ST->hasDQI()) 2887 if (const auto *Entry = ConvertCostTableLookup( 2888 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2889 return AdjustCost(Entry->Cost); 2890 2891 if (ST->hasAVX512()) 2892 if (const auto *Entry = ConvertCostTableLookup( 2893 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2894 return AdjustCost(Entry->Cost); 2895 } 2896 2897 if (ST->hasBWI()) 2898 if (const auto *Entry = ConvertCostTableLookup( 2899 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2900 return AdjustCost(Entry->Cost); 2901 2902 if (ST->hasDQI()) 2903 if (const auto *Entry = ConvertCostTableLookup( 2904 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2905 return AdjustCost(Entry->Cost); 2906 2907 if (ST->hasAVX512()) 2908 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, 2909 SimpleDstTy, SimpleSrcTy)) 2910 return AdjustCost(Entry->Cost); 2911 2912 if (ST->hasAVX2()) { 2913 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 2914 SimpleDstTy, SimpleSrcTy)) 2915 return AdjustCost(Entry->Cost); 2916 } 2917 2918 if (ST->hasAVX()) { 2919 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 2920 SimpleDstTy, SimpleSrcTy)) 2921 return AdjustCost(Entry->Cost); 2922 } 2923 2924 if (ST->hasSSE41()) { 2925 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 2926 SimpleDstTy, SimpleSrcTy)) 2927 return AdjustCost(Entry->Cost); 2928 } 2929 2930 if (ST->hasSSE2()) { 2931 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 2932 SimpleDstTy, SimpleSrcTy)) 2933 return AdjustCost(Entry->Cost); 2934 } 2935 } 2936 2937 // Fall back to legalized types. 
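// A hedged, illustrative walk-through of this legalized-type fallback (a
// sketch, assuming an SSE2-only target): a truncate from <8 x i64> to
// <8 x i16> has no simple-MVT entry above, so both types are legalized
// first. v8i64 is split into 4 x v2i64 (so LTSrc.first is roughly 4) while
// v8i16 is already legal (LTDest.first == 1); the SSE2 table entry for
// v8i16 <- v2i64 (PSHUFD+PSHUFLW) has cost 2, so the lookup below would
// return max(4, 1) * 2 == 8, before AdjustCost's cost-kind adjustment.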
2938 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src); 2939 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst); 2940 2941 // If we're truncating to the same legalized type - just assume its free. 2942 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) 2943 return TTI::TCC_Free; 2944 2945 if (ST->useAVX512Regs()) { 2946 if (ST->hasBWI()) 2947 if (const auto *Entry = ConvertCostTableLookup( 2948 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) 2949 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2950 2951 if (ST->hasDQI()) 2952 if (const auto *Entry = ConvertCostTableLookup( 2953 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) 2954 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2955 2956 if (ST->hasAVX512()) 2957 if (const auto *Entry = ConvertCostTableLookup( 2958 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) 2959 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2960 } 2961 2962 if (ST->hasBWI()) 2963 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, 2964 LTDest.second, LTSrc.second)) 2965 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2966 2967 if (ST->hasDQI()) 2968 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, 2969 LTDest.second, LTSrc.second)) 2970 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2971 2972 if (ST->hasAVX512()) 2973 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, 2974 LTDest.second, LTSrc.second)) 2975 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2976 2977 if (ST->hasAVX2()) 2978 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 2979 LTDest.second, LTSrc.second)) 2980 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2981 2982 if (ST->hasAVX()) 2983 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 2984 LTDest.second, LTSrc.second)) 2985 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2986 2987 if (ST->hasSSE41()) 2988 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 2989 LTDest.second, LTSrc.second)) 2990 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2991 2992 if (ST->hasSSE2()) 2993 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 2994 LTDest.second, LTSrc.second)) 2995 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2996 2997 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for 2998 // sitofp. 2999 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && 3000 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { 3001 Type *ExtSrc = Src->getWithNewBitWidth(32); 3002 unsigned ExtOpc = 3003 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; 3004 3005 // For scalar loads the extend would be free. 3006 InstructionCost ExtCost = 0; 3007 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) 3008 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); 3009 3010 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, 3011 TTI::CastContextHint::None, CostKind); 3012 } 3013 3014 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi 3015 // i32. 
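// Illustrative sketch of the fallback below (hedged; assuming an SSE2-only
// target where none of the tables above matched): a scalar fptoui
// float -> i8 is modelled as an fptosi float -> i32 (cost 4 in the SSE2
// table above) plus the cost of the i32 -> i8 trunc. Note that
// Instruction::FPToSI is used for the widened conversion even in the
// unsigned case, as the comment above indicates.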
3016 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && 3017 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { 3018 Type *TruncDst = Dst->getWithNewBitWidth(32); 3019 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + 3020 getCastInstrCost(Instruction::Trunc, Dst, TruncDst, 3021 TTI::CastContextHint::None, CostKind); 3022 } 3023 3024 return AdjustCost( 3025 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 3026 } 3027 3028 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 3029 Type *CondTy, 3030 CmpInst::Predicate VecPred, 3031 TTI::TargetCostKind CostKind, 3032 const Instruction *I) { 3033 // Early out if this type isn't scalar/vector integer/float. 3034 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) 3035 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 3036 I); 3037 3038 // Legalize the type. 3039 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 3040 3041 MVT MTy = LT.second; 3042 3043 int ISD = TLI->InstructionOpcodeToISD(Opcode); 3044 assert(ISD && "Invalid opcode"); 3045 3046 InstructionCost ExtraCost = 0; 3047 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { 3048 // Some vector comparison predicates cost extra instructions. 3049 // TODO: Should we invert this and assume worst case cmp costs 3050 // and reduce for particular predicates? 3051 if (MTy.isVector() && 3052 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || 3053 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || 3054 ST->hasBWI())) { 3055 // Fallback to I if a specific predicate wasn't specified. 3056 CmpInst::Predicate Pred = VecPred; 3057 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || 3058 Pred == CmpInst::BAD_FCMP_PREDICATE)) 3059 Pred = cast<CmpInst>(I)->getPredicate(); 3060 3061 switch (Pred) { 3062 case CmpInst::Predicate::ICMP_NE: 3063 // xor(cmpeq(x,y),-1) 3064 ExtraCost = 1; 3065 break; 3066 case CmpInst::Predicate::ICMP_SGE: 3067 case CmpInst::Predicate::ICMP_SLE: 3068 // xor(cmpgt(x,y),-1) 3069 ExtraCost = 1; 3070 break; 3071 case CmpInst::Predicate::ICMP_ULT: 3072 case CmpInst::Predicate::ICMP_UGT: 3073 // cmpgt(xor(x,signbit),xor(y,signbit)) 3074 // xor(cmpeq(pmaxu(x,y),x),-1) 3075 ExtraCost = 2; 3076 break; 3077 case CmpInst::Predicate::ICMP_ULE: 3078 case CmpInst::Predicate::ICMP_UGE: 3079 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || 3080 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { 3081 // cmpeq(psubus(x,y),0) 3082 // cmpeq(pminu(x,y),x) 3083 ExtraCost = 1; 3084 } else { 3085 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) 3086 ExtraCost = 3; 3087 } 3088 break; 3089 case CmpInst::Predicate::FCMP_ONE: 3090 case CmpInst::Predicate::FCMP_UEQ: 3091 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases. 3092 // Use FCMP_UEQ expansion - FCMP_ONE should be the same. 3093 if (CondTy && !ST->hasAVX()) 3094 return getCmpSelInstrCost(Opcode, ValTy, CondTy, 3095 CmpInst::Predicate::FCMP_UNO, CostKind) + 3096 getCmpSelInstrCost(Opcode, ValTy, CondTy, 3097 CmpInst::Predicate::FCMP_OEQ, CostKind) + 3098 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind); 3099 3100 break; 3101 case CmpInst::Predicate::BAD_ICMP_PREDICATE: 3102 case CmpInst::Predicate::BAD_FCMP_PREDICATE: 3103 // Assume worst case scenario and add the maximum extra cost. 
3104 ExtraCost = 3; 3105 break; 3106 default: 3107 break; 3108 } 3109 } 3110 } 3111 3112 static const CostKindTblEntry SLMCostTbl[] = { 3113 // slm pcmpeq/pcmpgt throughput is 2 3114 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } }, 3115 // slm pblendvb/blendvpd/blendvps throughput is 4 3116 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd 3117 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps 3118 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb 3119 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb 3120 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb 3121 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb 3122 }; 3123 3124 static const CostKindTblEntry AVX512BWCostTbl[] = { 3125 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } }, 3126 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } }, 3127 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } }, 3128 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } }, 3129 3130 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } }, 3131 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } }, 3132 }; 3133 3134 static const CostKindTblEntry AVX512CostTbl[] = { 3135 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } }, 3136 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } }, 3137 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } }, 3138 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } }, 3139 3140 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } }, 3141 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } }, 3142 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, 3143 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } }, 3144 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } }, 3145 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } }, 3146 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } }, 3147 3148 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } }, 3149 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } }, 3150 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } }, 3151 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } }, 3152 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } }, 3153 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } }, 3154 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } }, 3155 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } }, 3156 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } }, 3157 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } }, 3158 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } }, 3159 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } }, 3160 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } }, 3161 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } }, 3162 3163 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } }, 3164 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } }, 3165 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } }, 3166 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } }, 3167 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } }, 3168 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } }, 3169 }; 3170 3171 static const CostKindTblEntry AVX2CostTbl[] = { 3172 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } }, 3173 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } }, 3174 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } }, 3175 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } }, 3176 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } }, 3177 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } }, 3178 3179 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } }, 3180 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } }, 3181 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } }, 3182 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } }, 3183 3184 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd 3185 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps 3186 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb 3187 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb 3188 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb 3189 { ISD::SELECT, 
MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb 3190 }; 3191 3192 static const CostKindTblEntry XOPCostTbl[] = { 3193 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, 3194 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, 3195 }; 3196 3197 static const CostKindTblEntry AVX1CostTbl[] = { 3198 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } }, 3199 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } }, 3200 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } }, 3201 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } }, 3202 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } }, 3203 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } }, 3204 3205 // AVX1 does not support 8-wide integer compare. 3206 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, 3207 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } }, 3208 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } }, 3209 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } }, 3210 3211 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd 3212 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps 3213 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd 3214 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps 3215 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps 3216 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps 3217 }; 3218 3219 static const CostKindTblEntry SSE42CostTbl[] = { 3220 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } }, 3221 }; 3222 3223 static const CostKindTblEntry SSE41CostTbl[] = { 3224 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } }, 3225 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } }, 3226 3227 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd 3228 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd 3229 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps 3230 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps 3231 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb 3232 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb 3233 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb 3234 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb 3235 }; 3236 3237 static const CostKindTblEntry SSE2CostTbl[] = { 3238 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } }, 3239 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } }, 3240 3241 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion 3242 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } }, 3243 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } }, 3244 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } }, 3245 3246 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd 3247 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd 3248 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por 3249 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por 3250 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por 3251 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por 3252 }; 3253 3254 static const CostKindTblEntry SSE1CostTbl[] = { 3255 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } }, 3256 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } }, 3257 3258 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps 3259 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps 3260 }; 3261 3262 if (ST->useSLMArithCosts()) 3263 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 3264 if (auto KindCost = Entry->Cost[CostKind]) 3265 return LT.first * (ExtraCost + *KindCost); 3266 3267 if (ST->hasBWI()) 3268 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 3269 if (auto KindCost = Entry->Cost[CostKind]) 3270 return LT.first * (ExtraCost + 
*KindCost); 3271 3272 if (ST->hasAVX512()) 3273 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 3274 if (auto KindCost = Entry->Cost[CostKind]) 3275 return LT.first * (ExtraCost + *KindCost); 3276 3277 if (ST->hasAVX2()) 3278 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 3279 if (auto KindCost = Entry->Cost[CostKind]) 3280 return LT.first * (ExtraCost + *KindCost); 3281 3282 if (ST->hasXOP()) 3283 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 3284 if (auto KindCost = Entry->Cost[CostKind]) 3285 return LT.first * (ExtraCost + *KindCost); 3286 3287 if (ST->hasAVX()) 3288 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 3289 if (auto KindCost = Entry->Cost[CostKind]) 3290 return LT.first * (ExtraCost + *KindCost); 3291 3292 if (ST->hasSSE42()) 3293 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 3294 if (auto KindCost = Entry->Cost[CostKind]) 3295 return LT.first * (ExtraCost + *KindCost); 3296 3297 if (ST->hasSSE41()) 3298 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 3299 if (auto KindCost = Entry->Cost[CostKind]) 3300 return LT.first * (ExtraCost + *KindCost); 3301 3302 if (ST->hasSSE2()) 3303 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 3304 if (auto KindCost = Entry->Cost[CostKind]) 3305 return LT.first * (ExtraCost + *KindCost); 3306 3307 if (ST->hasSSE1()) 3308 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 3309 if (auto KindCost = Entry->Cost[CostKind]) 3310 return LT.first * (ExtraCost + *KindCost); 3311 3312 // Assume a 3cy latency for fp select ops. 3313 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select) 3314 if (ValTy->getScalarType()->isFloatingPointTy()) 3315 return 3; 3316 3317 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 3318 } 3319 3320 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } 3321 3322 InstructionCost 3323 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 3324 TTI::TargetCostKind CostKind) { 3325 // Costs should match the codegen from: 3326 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll 3327 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll 3328 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll 3329 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll 3330 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll 3331 3332 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not 3333 // specialized in these tables yet. 
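// A hedged, illustrative example of the IR these entries model: on a target
// with AVX512VBMI2 (plus AVX512VL for the narrower vectors), a call such as
//   %r = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
// is expected to select to a single instruction (vpshldvw), which is why the
// ISD::FSHL MVT::v8i16 entry below is { 1, 1, 1, 1 } for every cost kind.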
3334 static const CostKindTblEntry AVX512VBMI2CostTbl[] = { 3335 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } }, 3336 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } }, 3337 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } }, 3338 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } }, 3339 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } }, 3340 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } }, 3341 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } }, 3342 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } }, 3343 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } }, 3344 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } }, 3345 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } }, 3346 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } }, 3347 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, 3348 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, 3349 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, 3350 }; 3351 static const CostKindTblEntry AVX512BITALGCostTbl[] = { 3352 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, 3353 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } }, 3354 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } }, 3355 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } }, 3356 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } }, 3357 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } }, 3358 }; 3359 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { 3360 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } }, 3361 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } }, 3362 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } }, 3363 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } }, 3364 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } }, 3365 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } }, 3366 }; 3367 static const CostKindTblEntry AVX512CDCostTbl[] = { 3368 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } }, 3369 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } }, 3370 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } }, 3371 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } }, 3372 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } }, 3373 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } }, 3374 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } }, 3375 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } }, 3376 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } }, 3377 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } }, 3378 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } }, 3379 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } }, 3380 3381 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, 3382 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, 3383 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } }, 3384 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } }, 3385 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } }, 3386 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } }, 3387 }; 3388 static const CostKindTblEntry AVX512BWCostTbl[] = { 3389 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } }, 3390 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } }, 3391 { ISD::BITREVERSE, MVT::v8i64, { 3 } }, 3392 { ISD::BITREVERSE, MVT::v16i32, { 3 } }, 3393 { ISD::BITREVERSE, MVT::v32i16, { 3 } }, 3394 { ISD::BITREVERSE, MVT::v64i8, { 2 } }, 3395 { ISD::BSWAP, MVT::v8i64, { 1 } }, 3396 { ISD::BSWAP, MVT::v16i32, { 1 } }, 3397 { ISD::BSWAP, MVT::v32i16, { 1 } }, 3398 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } }, 3399 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } }, 3400 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } }, 3401 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } }, 3402 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } }, 3403 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } }, 3404 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } }, 3405 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } }, 3406 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } }, 3407 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } }, 3408 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } }, 3409 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } }, 
3410 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } }, 3411 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } }, 3412 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } }, 3413 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } }, 3414 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } }, 3415 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } }, 3416 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } }, 3417 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } }, 3418 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } }, 3419 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } }, 3420 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } }, 3421 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } }, 3422 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } }, 3423 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } }, 3424 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } }, 3425 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } }, 3426 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } }, 3427 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } }, 3428 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } }, 3429 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, 3430 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, 3431 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, 3432 { ISD::SADDSAT, MVT::v32i16, { 1 } }, 3433 { ISD::SADDSAT, MVT::v64i8, { 1 } }, 3434 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, 3435 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } }, 3436 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } }, 3437 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } }, 3438 { ISD::SSUBSAT, MVT::v32i16, { 1 } }, 3439 { ISD::SSUBSAT, MVT::v64i8, { 1 } }, 3440 { ISD::UADDSAT, MVT::v32i16, { 1 } }, 3441 { ISD::UADDSAT, MVT::v64i8, { 1 } }, 3442 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } }, 3443 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } }, 3444 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } }, 3445 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } }, 3446 { ISD::USUBSAT, MVT::v32i16, { 1 } }, 3447 { ISD::USUBSAT, MVT::v64i8, { 1 } }, 3448 }; 3449 static const CostKindTblEntry AVX512CostTbl[] = { 3450 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } }, 3451 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } }, 3452 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } }, 3453 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } }, 3454 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } }, 3455 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } }, 3456 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } }, 3457 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } }, 3458 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } }, 3459 { ISD::BITREVERSE, MVT::v8i64, { 36 } }, 3460 { ISD::BITREVERSE, MVT::v16i32, { 24 } }, 3461 { ISD::BITREVERSE, MVT::v32i16, { 10 } }, 3462 { ISD::BITREVERSE, MVT::v64i8, { 10 } }, 3463 { ISD::BSWAP, MVT::v8i64, { 4 } }, 3464 { ISD::BSWAP, MVT::v16i32, { 4 } }, 3465 { ISD::BSWAP, MVT::v32i16, { 4 } }, 3466 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } }, 3467 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } }, 3468 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } }, 3469 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } }, 3470 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } }, 3471 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } }, 3472 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } }, 3473 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } }, 3474 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, 3475 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, 3476 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } }, 3477 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } }, 3478 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } }, 3479 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } }, 3480 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } }, 3481 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } }, 3482 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } }, 3483 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } }, 3484 { ISD::ROTR, 
MVT::v8i64, { 1, 1, 1, 1 } }, 3485 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } }, 3486 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } }, 3487 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } }, 3488 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } }, 3489 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } }, 3490 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } }, 3491 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } }, 3492 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } }, 3493 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } }, 3494 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } }, 3495 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } }, 3496 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } }, 3497 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } }, 3498 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } }, 3499 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } }, 3500 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } }, 3501 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } }, 3502 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } }, 3503 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } }, 3504 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } }, 3505 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } }, 3506 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } }, 3507 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } }, 3508 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } }, 3509 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } }, 3510 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } }, 3511 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } }, 3512 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } }, 3513 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } }, 3514 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd 3515 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq 3516 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq 3517 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq 3518 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd 3519 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq 3520 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq 3521 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq 3522 { ISD::SADDSAT, MVT::v32i16, { 2 } }, 3523 { ISD::SADDSAT, MVT::v64i8, { 2 } }, 3524 { ISD::SSUBSAT, MVT::v32i16, { 2 } }, 3525 { ISD::SSUBSAT, MVT::v64i8, { 2 } }, 3526 { ISD::UADDSAT, MVT::v32i16, { 2 } }, 3527 { ISD::UADDSAT, MVT::v64i8, { 2 } }, 3528 { ISD::USUBSAT, MVT::v32i16, { 2 } }, 3529 { ISD::USUBSAT, MVT::v64i8, { 2 } }, 3530 { ISD::FMAXNUM, MVT::f32, { 2 } }, 3531 { ISD::FMAXNUM, MVT::v4f32, { 2 } }, 3532 { ISD::FMAXNUM, MVT::v8f32, { 2 } }, 3533 { ISD::FMAXNUM, MVT::v16f32, { 2 } }, 3534 { ISD::FMAXNUM, MVT::f64, { 2 } }, 3535 { ISD::FMAXNUM, MVT::v2f64, { 2 } }, 3536 { ISD::FMAXNUM, MVT::v4f64, { 2 } }, 3537 { ISD::FMAXNUM, MVT::v8f64, { 2 } }, 3538 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ 3539 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ 3540 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/ 3541 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/ 3542 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ 3543 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ 3544 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/ 3545 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/ 3546 }; 3547 static const CostKindTblEntry XOPCostTbl[] = { 3548 { ISD::BITREVERSE, MVT::v4i64, { 4 } }, 3549 { ISD::BITREVERSE, MVT::v8i32, { 4 } }, 3550 { ISD::BITREVERSE, MVT::v16i16, { 4 } }, 3551 { ISD::BITREVERSE, MVT::v32i8, { 4 } }, 
3552 { ISD::BITREVERSE, MVT::v2i64, { 1 } }, 3553 { ISD::BITREVERSE, MVT::v4i32, { 1 } }, 3554 { ISD::BITREVERSE, MVT::v8i16, { 1 } }, 3555 { ISD::BITREVERSE, MVT::v16i8, { 1 } }, 3556 { ISD::BITREVERSE, MVT::i64, { 3 } }, 3557 { ISD::BITREVERSE, MVT::i32, { 3 } }, 3558 { ISD::BITREVERSE, MVT::i16, { 3 } }, 3559 { ISD::BITREVERSE, MVT::i8, { 3 } }, 3560 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) 3561 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } }, 3562 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } }, 3563 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } }, 3564 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } }, 3565 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } }, 3566 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } }, 3567 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } }, 3568 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } }, 3569 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } }, 3570 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } }, 3571 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } }, 3572 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } }, 3573 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } }, 3574 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } }, 3575 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } }, 3576 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } } 3577 }; 3578 static const CostKindTblEntry AVX2CostTbl[] = { 3579 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) 3580 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) 3581 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } }, 3582 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } }, 3583 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } }, 3584 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } }, 3585 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } }, 3586 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } }, 3587 { ISD::BITREVERSE, MVT::v2i64, { 3 } }, 3588 { ISD::BITREVERSE, MVT::v4i64, { 3 } }, 3589 { ISD::BITREVERSE, MVT::v4i32, { 3 } }, 3590 { ISD::BITREVERSE, MVT::v8i32, { 3 } }, 3591 { ISD::BITREVERSE, MVT::v8i16, { 3 } }, 3592 { ISD::BITREVERSE, MVT::v16i16, { 3 } }, 3593 { ISD::BITREVERSE, MVT::v16i8, { 3 } }, 3594 { ISD::BITREVERSE, MVT::v32i8, { 3 } }, 3595 { ISD::BSWAP, MVT::v4i64, { 1 } }, 3596 { ISD::BSWAP, MVT::v8i32, { 1 } }, 3597 { ISD::BSWAP, MVT::v16i16, { 1 } }, 3598 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } }, 3599 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } }, 3600 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } }, 3601 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } }, 3602 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } }, 3603 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } }, 3604 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } }, 3605 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } }, 3606 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } }, 3607 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } }, 3608 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } }, 3609 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } }, 3610 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } }, 3611 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } }, 3612 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } }, 3613 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } }, 3614 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } }, 3615 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } }, 3616 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } }, 3617 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } }, 3618 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } }, 3619 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } }, 3620 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } }, 3621 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } }, 3622 { ISD::SADDSAT, MVT::v16i16, { 1 } }, 3623 { ISD::SADDSAT, MVT::v32i8, { 1 } }, 3624 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } }, 3625 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 
} }, 3626 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } }, 3627 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } }, 3628 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } }, 3629 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } }, 3630 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } }, 3631 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } }, 3632 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } }, 3633 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } }, 3634 { ISD::SSUBSAT, MVT::v16i16, { 1 } }, 3635 { ISD::SSUBSAT, MVT::v32i8, { 1 } }, 3636 { ISD::UADDSAT, MVT::v16i16, { 1 } }, 3637 { ISD::UADDSAT, MVT::v32i8, { 1 } }, 3638 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd 3639 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } }, 3640 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } }, 3641 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } }, 3642 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } }, 3643 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } }, 3644 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } }, 3645 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } }, 3646 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } }, 3647 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } }, 3648 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } }, 3649 { ISD::USUBSAT, MVT::v16i16, { 1 } }, 3650 { ISD::USUBSAT, MVT::v32i8, { 1 } }, 3651 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd 3652 { ISD::FMAXNUM, MVT::v8f32, { 3 } }, // MAXPS + CMPUNORDPS + BLENDVPS 3653 { ISD::FMAXNUM, MVT::v4f64, { 3 } }, // MAXPD + CMPUNORDPD + BLENDVPD 3654 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss 3655 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps 3656 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps 3657 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd 3658 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd 3659 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd 3660 }; 3661 static const CostKindTblEntry AVX1CostTbl[] = { 3662 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) 3663 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } }, 3664 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } }, 3665 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } }, 3666 { ISD::BITREVERSE, MVT::v4i64, { 12 } }, // 2 x 128-bit Op + extract/insert 3667 { ISD::BITREVERSE, MVT::v8i32, { 12 } }, // 2 x 128-bit Op + extract/insert 3668 { ISD::BITREVERSE, MVT::v16i16, { 12 } }, // 2 x 128-bit Op + extract/insert 3669 { ISD::BITREVERSE, MVT::v32i8, { 12 } }, // 2 x 128-bit Op + extract/insert 3670 { ISD::BSWAP, MVT::v4i64, { 4 } }, 3671 { ISD::BSWAP, MVT::v8i32, { 4 } }, 3672 { ISD::BSWAP, MVT::v16i16, { 4 } }, 3673 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert 3674 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } }, 3675 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert 3676 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } }, 3677 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert 3678 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } }, 3679 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert 3680 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } }, 3681 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert 3682 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } }, 3683 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert 3684 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } }, 3685 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert 3686 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } }, 3687 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + 
extract/insert 3688 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } }, 3689 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert 3690 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } }, 3691 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert 3692 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } }, 3693 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert 3694 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } }, 3695 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert 3696 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } }, 3697 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3698 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3699 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert 3700 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } }, 3701 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3702 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3703 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3704 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert 3705 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, 3706 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3707 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3708 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3709 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3710 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3711 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3712 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3713 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert 3714 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert 3715 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } }, 3716 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3717 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3718 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3719 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert 3720 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } }, 3721 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3722 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3723 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert 3724 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert 3725 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert 3726 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert 3727 { ISD::FMAXNUM, MVT::f32, { 3 } }, // MAXSS + CMPUNORDSS + BLENDVPS 3728 { ISD::FMAXNUM, MVT::v4f32, { 3 } }, // MAXPS + CMPUNORDPS + BLENDVPS 3729 { ISD::FMAXNUM, MVT::v8f32, { 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS + ? 3730 { ISD::FMAXNUM, MVT::f64, { 3 } }, // MAXSD + CMPUNORDSD + BLENDVPD 3731 { ISD::FMAXNUM, MVT::v2f64, { 3 } }, // MAXPD + CMPUNORDPD + BLENDVPD 3732 { ISD::FMAXNUM, MVT::v4f64, { 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD + ? 
3733 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss 3734 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps 3735 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps 3736 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd 3737 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd 3738 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd 3739 }; 3740 static const CostKindTblEntry GLMCostTbl[] = { 3741 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss 3742 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps 3743 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd 3744 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd 3745 }; 3746 static const CostKindTblEntry SLMCostTbl[] = { 3747 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss 3748 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps 3749 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd 3750 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd 3751 }; 3752 static const CostKindTblEntry SSE42CostTbl[] = { 3753 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd 3754 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd 3755 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ 3756 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ 3757 }; 3758 static const CostKindTblEntry SSE41CostTbl[] = { 3759 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) 3760 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } }, 3761 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } }, 3762 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } }, 3763 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, 3764 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } }, 3765 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } }, 3766 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } }, 3767 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } }, 3768 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } }, 3769 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } }, 3770 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } }, 3771 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } }, 3772 }; 3773 static const CostKindTblEntry SSSE3CostTbl[] = { 3774 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } }, 3775 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } }, 3776 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } }, 3777 { ISD::BITREVERSE, MVT::v2i64, { 5 } }, 3778 { ISD::BITREVERSE, MVT::v4i32, { 5 } }, 3779 { ISD::BITREVERSE, MVT::v8i16, { 5 } }, 3780 { ISD::BITREVERSE, MVT::v16i8, { 5 } }, 3781 { ISD::BSWAP, MVT::v2i64, { 1 } }, 3782 { ISD::BSWAP, MVT::v4i32, { 1 } }, 3783 { ISD::BSWAP, MVT::v8i16, { 1 } }, 3784 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } }, 3785 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } }, 3786 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } }, 3787 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } }, 3788 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } }, 3789 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } }, 3790 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } }, 3791 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } }, 3792 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } }, 3793 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } }, 3794 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } }, 3795 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } } 3796 }; 3797 static const CostKindTblEntry SSE2CostTbl[] = { 3798 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } }, 3799 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } }, 3800 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } }, 3801 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } }, 3802 { ISD::BITREVERSE, MVT::v2i64, { 29 } }, 3803 { ISD::BITREVERSE, MVT::v4i32, { 27 } }, 3804 { ISD::BITREVERSE, MVT::v8i16, 
{ 27 } }, 3805 { ISD::BITREVERSE, MVT::v16i8, { 20 } }, 3806 { ISD::BSWAP, MVT::v2i64, { 7 } }, 3807 { ISD::BSWAP, MVT::v4i32, { 7 } }, 3808 { ISD::BSWAP, MVT::v8i16, { 7 } }, 3809 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } }, 3810 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } }, 3811 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } }, 3812 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } }, 3813 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } }, 3814 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } }, 3815 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } }, 3816 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } }, 3817 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } }, 3818 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } }, 3819 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } }, 3820 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } }, 3821 { ISD::SADDSAT, MVT::v8i16, { 1 } }, 3822 { ISD::SADDSAT, MVT::v16i8, { 1 } }, 3823 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } }, 3824 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } }, 3825 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } }, 3826 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } }, 3827 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } }, 3828 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } }, 3829 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } }, 3830 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } }, 3831 { ISD::SSUBSAT, MVT::v8i16, { 1 } }, 3832 { ISD::SSUBSAT, MVT::v16i8, { 1 } }, 3833 { ISD::UADDSAT, MVT::v8i16, { 1 } }, 3834 { ISD::UADDSAT, MVT::v16i8, { 1 } }, 3835 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } }, 3836 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } }, 3837 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } }, 3838 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } }, 3839 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } }, 3840 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } }, 3841 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } }, 3842 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } }, 3843 { ISD::USUBSAT, MVT::v8i16, { 1 } }, 3844 { ISD::USUBSAT, MVT::v16i8, { 1 } }, 3845 { ISD::FMAXNUM, MVT::f64, { 4 } }, 3846 { ISD::FMAXNUM, MVT::v2f64, { 4 } }, 3847 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ 3848 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ 3849 }; 3850 static const CostKindTblEntry SSE1CostTbl[] = { 3851 { ISD::FMAXNUM, MVT::f32, { 4 } }, 3852 { ISD::FMAXNUM, MVT::v4f32, { 4 } }, 3853 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/ 3854 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/ 3855 }; 3856 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets 3857 { ISD::CTTZ, MVT::i64, { 1 } }, 3858 }; 3859 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets 3860 { ISD::CTTZ, MVT::i32, { 1 } }, 3861 { ISD::CTTZ, MVT::i16, { 1 } }, 3862 { ISD::CTTZ, MVT::i8, { 1 } }, 3863 }; 3864 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets 3865 { ISD::CTLZ, MVT::i64, { 1 } }, 3866 }; 3867 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets 3868 { ISD::CTLZ, MVT::i32, { 1 } }, 3869 { ISD::CTLZ, MVT::i16, { 2 } }, 3870 { ISD::CTLZ, MVT::i8, { 2 } }, 3871 }; 3872 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets 3873 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt 3874 }; 3875 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets 3876 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt 3877 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext()) 3878 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) 3879 }; 
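// Illustrative sketch (assumed usage, mirroring the lookups further below):
// reading one of these entries for a given cost kind looks roughly like
//   if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD::CTPOP, MVT::i16))
//     if (auto KindCost = Entry->Cost[TTI::TCK_CodeSize])
//       return *KindCost; // 2: popcnt plus the zero-extend of the i16 input.
// The four values per entry are {throughput, latency, code-size, size+latency}.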
3880 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets 3881 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV 3882 { ISD::BITREVERSE, MVT::i64, { 14 } }, 3883 { ISD::BSWAP, MVT::i64, { 1 } }, 3884 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 3885 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR 3886 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH 3887 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR 3888 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, 3889 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, 3890 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } }, 3891 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } }, 3892 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } }, 3893 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } }, 3894 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } }, 3895 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } }, 3896 { ISD::SADDO, MVT::i64, { 1 } }, 3897 { ISD::UADDO, MVT::i64, { 1 } }, 3898 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto 3899 }; 3900 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets 3901 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV 3902 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV 3903 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA 3904 { ISD::BITREVERSE, MVT::i32, { 14 } }, 3905 { ISD::BITREVERSE, MVT::i16, { 14 } }, 3906 { ISD::BITREVERSE, MVT::i8, { 11 } }, 3907 { ISD::BSWAP, MVT::i32, { 1 } }, 3908 { ISD::BSWAP, MVT::i16, { 1 } }, // ROL 3909 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 3910 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 3911 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV 3912 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR 3913 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR 3914 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR 3915 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH 3916 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH 3917 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH 3918 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF 3919 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF 3920 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF 3921 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, 3922 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, 3923 { ISD::CTPOP, MVT::i8, { 7, 6, 13, 13 } }, 3924 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } }, 3925 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } }, 3926 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } }, 3927 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } }, 3928 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } }, 3929 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } }, 3930 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } }, 3931 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } }, 3932 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } }, 3933 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } }, 3934 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } }, 3935 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } }, 3936 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } }, 3937 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } }, 3938 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } }, 3939 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } }, 3940 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } }, 3941 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } }, 3942 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } }, 3943 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } }, 3944 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } }, 3945 { ISD::SADDO, MVT::i32, { 1 } }, 3946 { ISD::SADDO, MVT::i16, { 1 } }, 3947 { ISD::SADDO, MVT::i8, { 1 } }, 3948 { ISD::UADDO, MVT::i32, { 1 } }, 3949 { ISD::UADDO, MVT::i16, { 1 } }, 3950 { ISD::UADDO, MVT::i8, { 1 
} }, 3951 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto 3952 { ISD::UMULO, MVT::i16, { 2 } }, 3953 { ISD::UMULO, MVT::i8, { 2 } }, 3954 }; 3955 3956 Type *RetTy = ICA.getReturnType(); 3957 Type *OpTy = RetTy; 3958 Intrinsic::ID IID = ICA.getID(); 3959 unsigned ISD = ISD::DELETED_NODE; 3960 switch (IID) { 3961 default: 3962 break; 3963 case Intrinsic::abs: 3964 ISD = ISD::ABS; 3965 break; 3966 case Intrinsic::bitreverse: 3967 ISD = ISD::BITREVERSE; 3968 break; 3969 case Intrinsic::bswap: 3970 ISD = ISD::BSWAP; 3971 break; 3972 case Intrinsic::ctlz: 3973 ISD = ISD::CTLZ; 3974 break; 3975 case Intrinsic::ctpop: 3976 ISD = ISD::CTPOP; 3977 break; 3978 case Intrinsic::cttz: 3979 ISD = ISD::CTTZ; 3980 break; 3981 case Intrinsic::fshl: 3982 ISD = ISD::FSHL; 3983 if (!ICA.isTypeBasedOnly()) { 3984 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 3985 if (Args[0] == Args[1]) 3986 ISD = ISD::ROTL; 3987 } 3988 break; 3989 case Intrinsic::fshr: 3990 // FSHR has same costs so don't duplicate. 3991 ISD = ISD::FSHL; 3992 if (!ICA.isTypeBasedOnly()) { 3993 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 3994 if (Args[0] == Args[1]) 3995 ISD = ISD::ROTR; 3996 } 3997 break; 3998 case Intrinsic::maxnum: 3999 case Intrinsic::minnum: 4000 // FMINNUM has same costs so don't duplicate. 4001 ISD = ISD::FMAXNUM; 4002 break; 4003 case Intrinsic::sadd_sat: 4004 ISD = ISD::SADDSAT; 4005 break; 4006 case Intrinsic::smax: 4007 ISD = ISD::SMAX; 4008 break; 4009 case Intrinsic::smin: 4010 ISD = ISD::SMIN; 4011 break; 4012 case Intrinsic::ssub_sat: 4013 ISD = ISD::SSUBSAT; 4014 break; 4015 case Intrinsic::uadd_sat: 4016 ISD = ISD::UADDSAT; 4017 break; 4018 case Intrinsic::umax: 4019 ISD = ISD::UMAX; 4020 break; 4021 case Intrinsic::umin: 4022 ISD = ISD::UMIN; 4023 break; 4024 case Intrinsic::usub_sat: 4025 ISD = ISD::USUBSAT; 4026 break; 4027 case Intrinsic::sqrt: 4028 ISD = ISD::FSQRT; 4029 break; 4030 case Intrinsic::sadd_with_overflow: 4031 case Intrinsic::ssub_with_overflow: 4032 // SSUBO has same costs so don't duplicate. 4033 ISD = ISD::SADDO; 4034 OpTy = RetTy->getContainedType(0); 4035 break; 4036 case Intrinsic::uadd_with_overflow: 4037 case Intrinsic::usub_with_overflow: 4038 // USUBO has same costs so don't duplicate. 4039 ISD = ISD::UADDO; 4040 OpTy = RetTy->getContainedType(0); 4041 break; 4042 case Intrinsic::umul_with_overflow: 4043 case Intrinsic::smul_with_overflow: 4044 // SMULO has same costs so don't duplicate. 4045 ISD = ISD::UMULO; 4046 OpTy = RetTy->getContainedType(0); 4047 break; 4048 } 4049 4050 if (ISD != ISD::DELETED_NODE) { 4051 // Legalize the type. 4052 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); 4053 MVT MTy = LT.second; 4054 4055 // Attempt to lookup cost. 4056 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && 4057 MTy.isVector()) { 4058 // With PSHUFB the code is very similar for all types. If we have integer 4059 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types 4060 // we also need a PSHUFB. 4061 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; 4062 4063 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB 4064 // instructions. We also need an extract and an insert. 4065 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || 4066 (ST->hasBWI() && MTy.is512BitVector()))) 4067 Cost = Cost * 2 + 2; 4068 4069 return LT.first * Cost; 4070 } 4071 4072 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. 
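// (Illustrative IR, assumed for exposition: a call such as
//    %r = call i32 @llvm.cttz.i32(i32 %x, i1 true)
//  asserts the input is non-zero, so it can be costed as CTTZ_ZERO_UNDEF,
//  i.e. a bare BSF, even without BMI/TZCNT.)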
4073 if (((ISD == ISD::CTTZ && !ST->hasBMI()) || 4074 (ISD == ISD::CTLZ && !ST->hasLZCNT())) && 4075 !MTy.isVector() && !ICA.isTypeBasedOnly()) { 4076 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 4077 if (auto *Cst = dyn_cast<ConstantInt>(Args[1])) 4078 if (Cst->isAllOnesValue()) 4079 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; 4080 } 4081 4082 // FSQRT is a single instruction. 4083 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) 4084 return LT.first; 4085 4086 auto adjustTableCost = [](int ISD, unsigned Cost, 4087 InstructionCost LegalizationCost, 4088 FastMathFlags FMF) { 4089 // If there are no NANs to deal with, then these are reduced to a 4090 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we 4091 // assume is used in the non-fast case. 4092 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { 4093 if (FMF.noNaNs()) 4094 return LegalizationCost * 1; 4095 } 4096 return LegalizationCost * (int)Cost; 4097 }; 4098 4099 if (ST->useGLMDivSqrtCosts()) 4100 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) 4101 if (auto KindCost = Entry->Cost[CostKind]) 4102 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4103 ICA.getFlags()); 4104 4105 if (ST->useSLMArithCosts()) 4106 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 4107 if (auto KindCost = Entry->Cost[CostKind]) 4108 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4109 ICA.getFlags()); 4110 4111 if (ST->hasVBMI2()) 4112 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) 4113 if (auto KindCost = Entry->Cost[CostKind]) 4114 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4115 ICA.getFlags()); 4116 4117 if (ST->hasBITALG()) 4118 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) 4119 if (auto KindCost = Entry->Cost[CostKind]) 4120 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4121 ICA.getFlags()); 4122 4123 if (ST->hasVPOPCNTDQ()) 4124 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) 4125 if (auto KindCost = Entry->Cost[CostKind]) 4126 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4127 ICA.getFlags()); 4128 4129 if (ST->hasCDI()) 4130 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) 4131 if (auto KindCost = Entry->Cost[CostKind]) 4132 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4133 ICA.getFlags()); 4134 4135 if (ST->hasBWI()) 4136 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 4137 if (auto KindCost = Entry->Cost[CostKind]) 4138 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4139 ICA.getFlags()); 4140 4141 if (ST->hasAVX512()) 4142 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 4143 if (auto KindCost = Entry->Cost[CostKind]) 4144 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4145 ICA.getFlags()); 4146 4147 if (ST->hasXOP()) 4148 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 4149 if (auto KindCost = Entry->Cost[CostKind]) 4150 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4151 ICA.getFlags()); 4152 4153 if (ST->hasAVX2()) 4154 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 4155 if (auto KindCost = Entry->Cost[CostKind]) 4156 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4157 ICA.getFlags()); 4158 4159 if (ST->hasAVX()) 4160 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 4161 if (auto KindCost = Entry->Cost[CostKind]) 4162 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 
4163 ICA.getFlags()); 4164 4165 if (ST->hasSSE42()) 4166 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 4167 if (auto KindCost = Entry->Cost[CostKind]) 4168 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4169 ICA.getFlags()); 4170 4171 if (ST->hasSSE41()) 4172 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 4173 if (auto KindCost = Entry->Cost[CostKind]) 4174 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4175 ICA.getFlags()); 4176 4177 if (ST->hasSSSE3()) 4178 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) 4179 if (auto KindCost = Entry->Cost[CostKind]) 4180 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4181 ICA.getFlags()); 4182 4183 if (ST->hasSSE2()) 4184 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 4185 if (auto KindCost = Entry->Cost[CostKind]) 4186 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4187 ICA.getFlags()); 4188 4189 if (ST->hasSSE1()) 4190 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 4191 if (auto KindCost = Entry->Cost[CostKind]) 4192 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4193 ICA.getFlags()); 4194 4195 if (ST->hasBMI()) { 4196 if (ST->is64Bit()) 4197 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) 4198 if (auto KindCost = Entry->Cost[CostKind]) 4199 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4200 ICA.getFlags()); 4201 4202 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) 4203 if (auto KindCost = Entry->Cost[CostKind]) 4204 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4205 ICA.getFlags()); 4206 } 4207 4208 if (ST->hasLZCNT()) { 4209 if (ST->is64Bit()) 4210 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) 4211 if (auto KindCost = Entry->Cost[CostKind]) 4212 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4213 ICA.getFlags()); 4214 4215 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) 4216 if (auto KindCost = Entry->Cost[CostKind]) 4217 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4218 ICA.getFlags()); 4219 } 4220 4221 if (ST->hasPOPCNT()) { 4222 if (ST->is64Bit()) 4223 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) 4224 if (auto KindCost = Entry->Cost[CostKind]) 4225 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4226 ICA.getFlags()); 4227 4228 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) 4229 if (auto KindCost = Entry->Cost[CostKind]) 4230 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4231 ICA.getFlags()); 4232 } 4233 4234 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { 4235 if (const Instruction *II = ICA.getInst()) { 4236 if (II->hasOneUse() && isa<StoreInst>(II->user_back())) 4237 return TTI::TCC_Free; 4238 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { 4239 if (LI->hasOneUse()) 4240 return TTI::TCC_Free; 4241 } 4242 } 4243 } 4244 4245 if (ST->is64Bit()) 4246 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) 4247 if (auto KindCost = Entry->Cost[CostKind]) 4248 return adjustTableCost(Entry->ISD, *KindCost, LT.first, 4249 ICA.getFlags()); 4250 4251 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) 4252 if (auto KindCost = Entry->Cost[CostKind]) 4253 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); 4254 } 4255 4256 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 4257 } 4258 4259 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 4260 TTI::TargetCostKind 
CostKind, 4261 unsigned Index, Value *Op0, 4262 Value *Op1) { 4263 static const CostTblEntry SLMCostTbl[] = { 4264 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, 4265 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, 4266 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, 4267 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } 4268 }; 4269 4270 assert(Val->isVectorTy() && "This must be a vector type"); 4271 Type *ScalarType = Val->getScalarType(); 4272 InstructionCost RegisterFileMoveCost = 0; 4273 4274 // Non-immediate extraction/insertion can be handled as a sequence of 4275 // aliased loads+stores via the stack. 4276 if (Index == -1U && (Opcode == Instruction::ExtractElement || 4277 Opcode == Instruction::InsertElement)) { 4278 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: 4279 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. 4280 4281 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. 4282 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected"); 4283 Align VecAlign = DL.getPrefTypeAlign(Val); 4284 Align SclAlign = DL.getPrefTypeAlign(ScalarType); 4285 4286 // Extract - store vector to stack, load scalar. 4287 if (Opcode == Instruction::ExtractElement) { 4288 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + 4289 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, 4290 CostKind); 4291 } 4292 // Insert - store vector to stack, store scalar, load vector. 4293 if (Opcode == Instruction::InsertElement) { 4294 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + 4295 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, 4296 CostKind) + 4297 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind); 4298 } 4299 } 4300 4301 if (Index != -1U && (Opcode == Instruction::ExtractElement || 4302 Opcode == Instruction::InsertElement)) { 4303 // Extraction of vXi1 elements are now efficiently handled by MOVMSK. 4304 if (Opcode == Instruction::ExtractElement && 4305 ScalarType->getScalarSizeInBits() == 1 && 4306 cast<FixedVectorType>(Val)->getNumElements() > 1) 4307 return 1; 4308 4309 // Legalize the type. 4310 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 4311 4312 // This type is legalized to a scalar type. 4313 if (!LT.second.isVector()) 4314 return 0; 4315 4316 // The type may be split. Normalize the index to the new type. 4317 unsigned SizeInBits = LT.second.getSizeInBits(); 4318 unsigned NumElts = LT.second.getVectorNumElements(); 4319 unsigned SubNumElts = NumElts; 4320 Index = Index % NumElts; 4321 4322 // For >128-bit vectors, we need to extract higher 128-bit subvectors. 4323 // For inserts, we also need to insert the subvector back. 4324 if (SizeInBits > 128) { 4325 assert((SizeInBits % 128) == 0 && "Illegal vector"); 4326 unsigned NumSubVecs = SizeInBits / 128; 4327 SubNumElts = NumElts / NumSubVecs; 4328 if (SubNumElts <= Index) { 4329 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); 4330 Index %= SubNumElts; 4331 } 4332 } 4333 4334 MVT MScalarTy = LT.second.getScalarType(); 4335 auto IsCheapPInsrPExtrInsertPS = [&]() { 4336 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. 4337 // Also, assume insertps is relatively cheap on all >= SSE41 targets. 
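// (Illustrative, assumed lowering: on SSE4.1+ an insert such as
//    insertelement <4 x float> %v, float %s, i32 2
//  typically becomes a single INSERTPS, and integer inserts/extracts map to
//  one PINSR*/PEXTR* plus a GPR<->XMM transfer, which the code below treats
//  as cost 1 plus any register-file move.)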
4338 return (MScalarTy == MVT::i16 && ST->hasSSE2()) || 4339 (MScalarTy.isInteger() && ST->hasSSE41()) || 4340 (MScalarTy == MVT::f32 && ST->hasSSE41() && 4341 Opcode == Instruction::InsertElement); 4342 }; 4343 4344 if (Index == 0) { 4345 // Floating point scalars are already located in index #0. 4346 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume 4347 // true for all. 4348 if (ScalarType->isFloatingPointTy()) 4349 return RegisterFileMoveCost; 4350 4351 if (Opcode == Instruction::InsertElement && 4352 isa_and_nonnull<UndefValue>(Op0)) { 4353 // Consider the gather cost to be cheap. 4354 if (isa_and_nonnull<LoadInst>(Op1)) 4355 return RegisterFileMoveCost; 4356 if (!IsCheapPInsrPExtrInsertPS()) { 4357 // mov constant-to-GPR + movd/movq GPR -> XMM. 4358 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy()) 4359 return 2 + RegisterFileMoveCost; 4360 // Assume movd/movq GPR -> XMM is relatively cheap on all targets. 4361 return 1 + RegisterFileMoveCost; 4362 } 4363 } 4364 4365 // Assume movd/movq XMM -> GPR is relatively cheap on all targets. 4366 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) 4367 return 1 + RegisterFileMoveCost; 4368 } 4369 4370 int ISD = TLI->InstructionOpcodeToISD(Opcode); 4371 assert(ISD && "Unexpected vector opcode"); 4372 if (ST->useSLMArithCosts()) 4373 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) 4374 return Entry->Cost + RegisterFileMoveCost; 4375 4376 // Consider cheap cases. 4377 if (IsCheapPInsrPExtrInsertPS()) 4378 return 1 + RegisterFileMoveCost; 4379 4380 // For extractions we just need to shuffle the element to index 0, which 4381 // should be very cheap (assume cost = 1). For insertions we need to shuffle 4382 // the elements to its destination. In both cases we must handle the 4383 // subvector move(s). 4384 // If the vector type is already less than 128-bits then don't reduce it. 4385 // TODO: Under what circumstances should we shuffle using the full width? 4386 InstructionCost ShuffleCost = 1; 4387 if (Opcode == Instruction::InsertElement) { 4388 auto *SubTy = cast<VectorType>(Val); 4389 EVT VT = TLI->getValueType(DL, Val); 4390 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) 4391 SubTy = FixedVectorType::get(ScalarType, SubNumElts); 4392 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt, 4393 CostKind, 0, SubTy); 4394 } 4395 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; 4396 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; 4397 } 4398 4399 // Add to the base cost if we know that the extracted element of a vector is 4400 // destined to be moved to and used in the integer register file. 
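// (Pointer elements always live in GPRs, so an extractelement of a pointer
//  implies an extra XMM->GPR move on top of the base cost; this is a
//  heuristic rather than an exact per-CPU figure.)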
4401   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
4402     RegisterFileMoveCost += 1;
4403 
4404   return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4405          RegisterFileMoveCost;
4406 }
4407 
4408 InstructionCost
4409 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4410                                      bool Insert, bool Extract,
4411                                      TTI::TargetCostKind CostKind) {
4412   assert(DemandedElts.getBitWidth() ==
4413              cast<FixedVectorType>(Ty)->getNumElements() &&
4414          "Vector size mismatch");
4415 
4416   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4417   MVT MScalarTy = LT.second.getScalarType();
4418   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4419   InstructionCost Cost = 0;
4420 
4421   constexpr unsigned LaneBitWidth = 128;
4422   assert((LegalVectorBitWidth < LaneBitWidth ||
4423           (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4424          "Illegal vector");
4425 
4426   const int NumLegalVectors = *LT.first.getValue();
4427   assert(NumLegalVectors >= 0 && "Negative cost!");
4428 
4429   // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
4430   // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4431   if (Insert) {
4432     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4433         (MScalarTy.isInteger() && ST->hasSSE41()) ||
4434         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4435       // For types we can insert directly, insertion into 128-bit subvectors is
4436       // cheap, followed by a cheap chain of concatenations.
4437       if (LegalVectorBitWidth <= LaneBitWidth) {
4438         Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4439                                                 /*Extract*/ false, CostKind);
4440       } else {
4441         // For each 128-bit lane: if at least one index is demanded but not
4442         // all indices are demanded, and this lane is not the first 128-bit
4443         // lane of the legalized vector, then this lane needs an extracti128;
4444         // if a 128-bit lane has at least one demanded index, it also needs
4445         // an inserti128.
4446 
4447         // The following cases illustrate this:
4448         // Assume we insert several elements into a v8i32 vector on AVX2.
4449         // Case #1: inserting into index 1 needs vpinsrd + inserti128.
4450         // Case #2: inserting into index 5 needs extracti128 + vpinsrd +
4451         //          inserti128.
4452         // Case #3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4453         assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4454         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4455         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4456         unsigned NumLegalElts =
4457             LT.second.getVectorNumElements() * NumLegalVectors;
4458         assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4459                "Vector has been legalized to smaller element count");
4460         assert((NumLegalElts % NumLanesTotal) == 0 &&
4461                "Unexpected elts per lane");
4462         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4463 
4464         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4465         auto *LaneTy =
4466             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4467 
4468         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4469           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4470               NumEltsPerLane, NumEltsPerLane * I);
4471           if (LaneEltMask.isNullValue())
4472             continue;
4473           // FIXME: we don't need to extract if all non-demanded elements
4474           //        are legalization-inserted padding.
4475 if (!LaneEltMask.isAllOnes()) 4476 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 4477 CostKind, I * NumEltsPerLane, LaneTy); 4478 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert, 4479 /*Extract*/ false, CostKind); 4480 } 4481 4482 APInt AffectedLanes = 4483 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal); 4484 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( 4485 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true); 4486 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { 4487 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { 4488 unsigned I = NumLegalLanes * LegalVec + Lane; 4489 // No need to insert unaffected lane; or lane 0 of each legal vector 4490 // iff ALL lanes of that vector were affected and will be inserted. 4491 if (!AffectedLanes[I] || 4492 (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) 4493 continue; 4494 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt, 4495 CostKind, I * NumEltsPerLane, LaneTy); 4496 } 4497 } 4498 } 4499 } else if (LT.second.isVector()) { 4500 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded 4501 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a 4502 // series of UNPCK followed by CONCAT_VECTORS - all of these can be 4503 // considered cheap. 4504 if (Ty->isIntOrIntVectorTy()) 4505 Cost += DemandedElts.countPopulation(); 4506 4507 // Get the smaller of the legalized or original pow2-extended number of 4508 // vector elements, which represents the number of unpacks we'll end up 4509 // performing. 4510 unsigned NumElts = LT.second.getVectorNumElements(); 4511 unsigned Pow2Elts = 4512 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); 4513 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; 4514 } 4515 } 4516 4517 if (Extract) { 4518 // vXi1 can be efficiently extracted with MOVMSK. 4519 // TODO: AVX512 predicate mask handling. 4520 // NOTE: This doesn't work well for roundtrip scalarization. 4521 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { 4522 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); 4523 unsigned MaxElts = ST->hasAVX2() ? 32 : 16; 4524 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; 4525 return MOVMSKCost; 4526 } 4527 4528 if (LT.second.isVector()) { 4529 unsigned NumLegalElts = 4530 LT.second.getVectorNumElements() * NumLegalVectors; 4531 assert(NumLegalElts >= DemandedElts.getBitWidth() && 4532 "Vector has been legalized to smaller element count"); 4533 4534 // If we're extracting elements from a 128-bit subvector lane, 4535 // we only need to extract each lane once, not for every element. 4536 if (LegalVectorBitWidth > LaneBitWidth) { 4537 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; 4538 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; 4539 assert((NumLegalElts % NumLanesTotal) == 0 && 4540 "Unexpected elts per lane"); 4541 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; 4542 4543 // Add cost for each demanded 128-bit subvector extraction. 4544 // Luckily this is a lot easier than for insertion. 
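// (Worked example, assumed for exposition: extracting elements 5 and 6 of a
//  v8i32 on AVX pays one SK_ExtractSubvector shuffle for the upper 128-bit
//  lane plus the per-element extract costs within that v4i32 lane; lanes with
//  no demanded elements are skipped entirely.)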
4545 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); 4546 auto *LaneTy = 4547 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); 4548 4549 for (unsigned I = 0; I != NumLanesTotal; ++I) { 4550 APInt LaneEltMask = WidenedDemandedElts.extractBits( 4551 NumEltsPerLane, I * NumEltsPerLane); 4552 if (LaneEltMask.isNullValue()) 4553 continue; 4554 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 4555 CostKind, I * NumEltsPerLane, LaneTy); 4556 Cost += BaseT::getScalarizationOverhead( 4557 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind); 4558 } 4559 4560 return Cost; 4561 } 4562 } 4563 4564 // Fallback to default extraction. 4565 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false, 4566 Extract, CostKind); 4567 } 4568 4569 return Cost; 4570 } 4571 4572 InstructionCost 4573 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 4574 int VF, const APInt &DemandedDstElts, 4575 TTI::TargetCostKind CostKind) { 4576 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); 4577 // We don't differentiate element types here, only element bit width. 4578 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); 4579 4580 auto bailout = [&]() { 4581 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, 4582 DemandedDstElts, CostKind); 4583 }; 4584 4585 // For now, only deal with AVX512 cases. 4586 if (!ST->hasAVX512()) 4587 return bailout(); 4588 4589 // Do we have a native shuffle for this element type, or should we promote? 4590 unsigned PromEltTyBits = EltTyBits; 4591 switch (EltTyBits) { 4592 case 32: 4593 case 64: 4594 break; // AVX512F. 4595 case 16: 4596 if (!ST->hasBWI()) 4597 PromEltTyBits = 32; // promote to i32, AVX512F. 4598 break; // AVX512BW 4599 case 8: 4600 if (!ST->hasVBMI()) 4601 PromEltTyBits = 32; // promote to i32, AVX512F. 4602 break; // AVX512VBMI 4603 case 1: 4604 // There is no support for shuffling i1 elements. We *must* promote. 4605 if (ST->hasBWI()) { 4606 if (ST->hasVBMI()) 4607 PromEltTyBits = 8; // promote to i8, AVX512VBMI. 4608 else 4609 PromEltTyBits = 16; // promote to i16, AVX512BW. 4610 break; 4611 } 4612 PromEltTyBits = 32; // promote to i32, AVX512F. 4613 break; 4614 default: 4615 return bailout(); 4616 } 4617 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); 4618 4619 auto *SrcVecTy = FixedVectorType::get(EltTy, VF); 4620 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); 4621 4622 int NumDstElements = VF * ReplicationFactor; 4623 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); 4624 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); 4625 4626 // Legalize the types. 4627 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second; 4628 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second; 4629 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second; 4630 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second; 4631 // They should have legalized into vector types. 4632 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || 4633 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) 4634 return bailout(); 4635 4636 if (PromEltTyBits != EltTyBits) { 4637 // If we have to perform the shuffle with wider elt type than our data type, 4638 // then we will first need to anyext (we don't care about the new bits) 4639 // the source elements, and then truncate Dst elements. 
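// (Illustrative, assumed example: replicating <8 x i1> on an AVX512BW-only
//  subtarget promotes the source to <8 x i16>, performs the replication
//  shuffle on i16 elements, then truncates the widened destination back to
//  i1.)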
4640 InstructionCost PromotionCost; 4641 PromotionCost += getCastInstrCost( 4642 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, 4643 TargetTransformInfo::CastContextHint::None, CostKind); 4644 PromotionCost += 4645 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, 4646 /*Src=*/PromDstVecTy, 4647 TargetTransformInfo::CastContextHint::None, CostKind); 4648 return PromotionCost + getReplicationShuffleCost(PromEltTy, 4649 ReplicationFactor, VF, 4650 DemandedDstElts, CostKind); 4651 } 4652 4653 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && 4654 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && 4655 "We expect that the legalization doesn't affect the element width, " 4656 "doesn't coalesce/split elements."); 4657 4658 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); 4659 unsigned NumDstVectors = 4660 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); 4661 4662 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); 4663 4664 // Not all the produced Dst elements may be demanded. In our case, 4665 // given that a single Dst vector is formed by a single shuffle, 4666 // if all elements that will form a single Dst vector aren't demanded, 4667 // then we won't need to do that shuffle, so adjust the cost accordingly. 4668 APInt DemandedDstVectors = APIntOps::ScaleBitMask( 4669 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors); 4670 unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); 4671 4672 InstructionCost SingleShuffleCost = getShuffleCost( 4673 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind, 4674 /*Index=*/0, /*SubTp=*/nullptr); 4675 return NumDstVectorsDemanded * SingleShuffleCost; 4676 } 4677 4678 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 4679 MaybeAlign Alignment, 4680 unsigned AddressSpace, 4681 TTI::TargetCostKind CostKind, 4682 TTI::OperandValueInfo OpInfo, 4683 const Instruction *I) { 4684 // TODO: Handle other cost kinds. 4685 if (CostKind != TTI::TCK_RecipThroughput) { 4686 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { 4687 // Store instruction with index and scale costs 2 Uops. 4688 // Check the preceding GEP to identify non-const indices. 4689 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { 4690 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) 4691 return TTI::TCC_Basic * 2; 4692 } 4693 } 4694 return TTI::TCC_Basic; 4695 } 4696 4697 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 4698 "Invalid Opcode"); 4699 // Type legalization can't handle structs 4700 if (TLI->getValueType(DL, Src, true) == MVT::Other) 4701 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 4702 CostKind); 4703 4704 // Legalize the type. 4705 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 4706 4707 auto *VTy = dyn_cast<FixedVectorType>(Src); 4708 4709 InstructionCost Cost = 0; 4710 4711 // Add a cost for constant load to vector. 4712 if (Opcode == Instruction::Store && OpInfo.isConstant()) 4713 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src), 4714 /*AddressSpace=*/0, CostKind); 4715 4716 // Handle the simple case of non-vectors. 4717 // NOTE: this assumes that legalization never creates vector from scalars! 4718 if (!VTy || !LT.second.isVector()) { 4719 // Each load/store unit costs 1. 4720 return (LT.second.isFloatingPoint() ? 
Cost : 0) + LT.first * 1; 4721 } 4722 4723 bool IsLoad = Opcode == Instruction::Load; 4724 4725 Type *EltTy = VTy->getElementType(); 4726 4727 const int EltTyBits = DL.getTypeSizeInBits(EltTy); 4728 4729 // Source of truth: how many elements were there in the original IR vector? 4730 const unsigned SrcNumElt = VTy->getNumElements(); 4731 4732 // How far have we gotten? 4733 int NumEltRemaining = SrcNumElt; 4734 // Note that we intentionally capture by-reference, NumEltRemaining changes. 4735 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; 4736 4737 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8); 4738 4739 // Note that even if we can store 64 bits of an XMM, we still operate on XMM. 4740 const unsigned XMMBits = 128; 4741 if (XMMBits % EltTyBits != 0) 4742 // Vector size must be a multiple of the element size. I.e. no padding. 4743 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 4744 CostKind); 4745 const int NumEltPerXMM = XMMBits / EltTyBits; 4746 4747 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM); 4748 4749 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; 4750 NumEltRemaining > 0; CurrOpSizeBytes /= 2) { 4751 // How many elements would a single op deal with at once? 4752 if ((8 * CurrOpSizeBytes) % EltTyBits != 0) 4753 // Vector size must be a multiple of the element size. I.e. no padding. 4754 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 4755 CostKind); 4756 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; 4757 4758 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?"); 4759 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || 4760 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && 4761 "Unless we haven't halved the op size yet, " 4762 "we have less than two op's sized units of work left."); 4763 4764 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM 4765 ? FixedVectorType::get(EltTy, CurrNumEltPerOp) 4766 : XMMVecTy; 4767 4768 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && 4769 "After halving sizes, the vector elt count is no longer a multiple " 4770 "of number of elements per operation?"); 4771 auto *CoalescedVecTy = 4772 CurrNumEltPerOp == 1 4773 ? CurrVecTy 4774 : FixedVectorType::get( 4775 IntegerType::get(Src->getContext(), 4776 EltTyBits * CurrNumEltPerOp), 4777 CurrVecTy->getNumElements() / CurrNumEltPerOp); 4778 assert(DL.getTypeSizeInBits(CoalescedVecTy) == 4779 DL.getTypeSizeInBits(CurrVecTy) && 4780 "coalesciing elements doesn't change vector width."); 4781 4782 while (NumEltRemaining > 0) { 4783 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?"); 4784 4785 // Can we use this vector size, as per the remaining element count? 4786 // Iff the vector is naturally aligned, we can do a wide load regardless. 4787 if (NumEltRemaining < CurrNumEltPerOp && 4788 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) && 4789 CurrOpSizeBytes != 1) 4790 break; // Try smalled vector size. 4791 4792 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; 4793 4794 // If we have fully processed the previous reg, we need to replenish it. 4795 if (SubVecEltsLeft == 0) { 4796 SubVecEltsLeft += CurrVecTy->getNumElements(); 4797 // And that's free only for the 0'th subvector of a legalized vector. 4798 if (!Is0thSubVec) 4799 Cost += getShuffleCost(IsLoad ? 
TTI::ShuffleKind::SK_InsertSubvector 4800 : TTI::ShuffleKind::SK_ExtractSubvector, 4801 VTy, std::nullopt, CostKind, NumEltDone(), 4802 CurrVecTy); 4803 } 4804 4805 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, 4806 // for smaller widths (32/16/8) we have to insert/extract them separately. 4807 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, 4808 // but let's pretend that it is also true for 16/8 bit wide ops...) 4809 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { 4810 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; 4811 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && ""); 4812 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; 4813 APInt DemandedElts = 4814 APInt::getBitsSet(CoalescedVecTy->getNumElements(), 4815 CoalescedVecEltIdx, CoalescedVecEltIdx + 1); 4816 assert(DemandedElts.countPopulation() == 1 && "Inserting single value"); 4817 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, 4818 !IsLoad, CostKind); 4819 } 4820 4821 // This isn't exactly right. We're using slow unaligned 32-byte accesses 4822 // as a proxy for a double-pumped AVX memory interface such as on 4823 // Sandybridge. 4824 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) 4825 Cost += 2; 4826 else 4827 Cost += 1; 4828 4829 SubVecEltsLeft -= CurrNumEltPerOp; 4830 NumEltRemaining -= CurrNumEltPerOp; 4831 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); 4832 } 4833 } 4834 4835 assert(NumEltRemaining <= 0 && "Should have processed all the elements."); 4836 4837 return Cost; 4838 } 4839 4840 InstructionCost 4841 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, 4842 unsigned AddressSpace, 4843 TTI::TargetCostKind CostKind) { 4844 bool IsLoad = (Instruction::Load == Opcode); 4845 bool IsStore = (Instruction::Store == Opcode); 4846 4847 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); 4848 if (!SrcVTy) 4849 // To calculate scalar take the regular cost, without mask 4850 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); 4851 4852 unsigned NumElem = SrcVTy->getNumElements(); 4853 auto *MaskTy = 4854 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); 4855 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || 4856 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { 4857 // Scalarization 4858 APInt DemandedElts = APInt::getAllOnes(NumElem); 4859 InstructionCost MaskSplitCost = getScalarizationOverhead( 4860 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind); 4861 InstructionCost ScalarCompareCost = getCmpSelInstrCost( 4862 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, 4863 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4864 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); 4865 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); 4866 InstructionCost ValueSplitCost = getScalarizationOverhead( 4867 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind); 4868 InstructionCost MemopCost = 4869 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 4870 Alignment, AddressSpace, CostKind); 4871 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; 4872 } 4873 4874 // Legalize the type. 
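// (E.g., assumed illustration: a v16i32 masked load on an AVX2 subtarget is
//  legalized by splitting into two v8i32 VPMASKMOVD operations, i.e.
//  LT.first == 2 and LT.second == v8i32, which scales the per-op costs
//  below.)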
4875   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4876   auto VT = TLI->getValueType(DL, SrcVTy);
4877   InstructionCost Cost = 0;
4878   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4879       LT.second.getVectorNumElements() == NumElem)
4880     // Promotion requires extend/truncate for data and a shuffle for mask.
4881     Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
4882                            CostKind, 0, nullptr) +
4883             getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
4884                            CostKind, 0, nullptr);
4885 
4886   else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4887     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4888                                            LT.second.getVectorNumElements());
4889     // Expanding requires filling the mask with zeroes.
4890     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
4891                            CostKind, 0, MaskTy);
4892   }
4893 
4894   // Pre-AVX512 - each maskmov load costs 2, each maskmov store costs ~8.
4895   if (!ST->hasAVX512())
4896     return Cost + LT.first * (IsLoad ? 2 : 8);
4897 
4898   // AVX-512 masked load/store is cheaper.
4899   return Cost + LT.first;
4900 }
4901 
4902 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
4903                                                       ScalarEvolution *SE,
4904                                                       const SCEV *Ptr) {
4905   // Address computations in vectorized code with non-consecutive addresses
4906   // will likely result in more instructions compared to scalar code where
4907   // the computation can more often be merged into the index mode. The
4908   // resulting extra micro-ops can significantly decrease throughput.
4909   const unsigned NumVectorInstToHideOverhead = 10;
4910 
4911   // The cost of computing a strided access is hidden by X86's indexing
4912   // modes, regardless of the stride value. We do not believe there is a
4913   // difference between constant strided access in general and a constant
4914   // stride whose value is less than or equal to 64.
4915   // Even in the case of a (loop-invariant) stride whose value is not known
4916   // at compile time, the address computation will not incur more than one
4917   // extra ADD instruction.
4918   if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
4919     // TODO: AVX2 is the current cut-off because we don't have correct
4920     // interleaving costs for prior ISAs.
4921     if (!BaseT::isStridedAccess(Ptr))
4922       return NumVectorInstToHideOverhead;
4923     if (!BaseT::getConstantStrideStep(SE, Ptr))
4924       return 1;
4925   }
4926 
4927   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
4928 }
4929 
4930 InstructionCost
4931 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4932                                        std::optional<FastMathFlags> FMF,
4933                                        TTI::TargetCostKind CostKind) {
4934   if (TTI::requiresOrderedReduction(FMF))
4935     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4936 
4937   // We use the Intel Architecture Code Analyzer (IACA) to measure the
4938   // throughput and use that as the cost.
4939 
4940   static const CostTblEntry SLMCostTblNoPairWise[] = {
4941     { ISD::FADD, MVT::v2f64, 3 },
4942     { ISD::ADD,  MVT::v2i64, 5 },
4943   };
4944 
4945   static const CostTblEntry SSE2CostTblNoPairWise[] = {
4946     { ISD::FADD, MVT::v2f64, 2 },
4947     { ISD::FADD, MVT::v2f32, 2 },
4948     { ISD::FADD, MVT::v4f32, 4 },
4949     { ISD::ADD, MVT::v2i64, 2 },  // The data reported by the IACA tool is "1.6".
4950     { ISD::ADD, MVT::v2i32, 2 },  // FIXME: chosen to be less than v4i32
4951     { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.3".
4952     { ISD::ADD, MVT::v2i16, 2 },  // The data reported by the IACA tool is "4.3".
4953 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". 4954 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 4955 { ISD::ADD, MVT::v2i8, 2 }, 4956 { ISD::ADD, MVT::v4i8, 2 }, 4957 { ISD::ADD, MVT::v8i8, 2 }, 4958 { ISD::ADD, MVT::v16i8, 3 }, 4959 }; 4960 4961 static const CostTblEntry AVX1CostTblNoPairWise[] = { 4962 { ISD::FADD, MVT::v4f64, 3 }, 4963 { ISD::FADD, MVT::v4f32, 3 }, 4964 { ISD::FADD, MVT::v8f32, 4 }, 4965 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". 4966 { ISD::ADD, MVT::v4i64, 3 }, 4967 { ISD::ADD, MVT::v8i32, 5 }, 4968 { ISD::ADD, MVT::v16i16, 5 }, 4969 { ISD::ADD, MVT::v32i8, 4 }, 4970 }; 4971 4972 int ISD = TLI->InstructionOpcodeToISD(Opcode); 4973 assert(ISD && "Invalid opcode"); 4974 4975 // Before legalizing the type, give a chance to look up illegal narrow types 4976 // in the table. 4977 // FIXME: Is there a better way to do this? 4978 EVT VT = TLI->getValueType(DL, ValTy); 4979 if (VT.isSimple()) { 4980 MVT MTy = VT.getSimpleVT(); 4981 if (ST->useSLMArithCosts()) 4982 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) 4983 return Entry->Cost; 4984 4985 if (ST->hasAVX()) 4986 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 4987 return Entry->Cost; 4988 4989 if (ST->hasSSE2()) 4990 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 4991 return Entry->Cost; 4992 } 4993 4994 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 4995 4996 MVT MTy = LT.second; 4997 4998 auto *ValVTy = cast<FixedVectorType>(ValTy); 4999 5000 // Special case: vXi8 mul reductions are performed as vXi16. 5001 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { 5002 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); 5003 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); 5004 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, 5005 TargetTransformInfo::CastContextHint::None, 5006 CostKind) + 5007 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); 5008 } 5009 5010 InstructionCost ArithmeticCost = 0; 5011 if (LT.first != 1 && MTy.isVector() && 5012 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 5013 // Type needs to be split. We need LT.first - 1 arithmetic ops. 5014 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), 5015 MTy.getVectorNumElements()); 5016 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); 5017 ArithmeticCost *= LT.first - 1; 5018 } 5019 5020 if (ST->useSLMArithCosts()) 5021 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) 5022 return ArithmeticCost + Entry->Cost; 5023 5024 if (ST->hasAVX()) 5025 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 5026 return ArithmeticCost + Entry->Cost; 5027 5028 if (ST->hasSSE2()) 5029 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 5030 return ArithmeticCost + Entry->Cost; 5031 5032 // FIXME: These assume a naive kshift+binop lowering, which is probably 5033 // conservative in most cases. 
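// (Rough sketch of the assumed naive lowering: an AND-reduction of v8i1
//  becomes a log2(8) == 3 step chain of KSHIFTR + KAND followed by a final
//  KMOV/TEST, i.e. 2*log2(N)+1 ops, which is where the "7" entry for v8i1
//  below comes from.)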
5034 static const CostTblEntry AVX512BoolReduction[] = { 5035 { ISD::AND, MVT::v2i1, 3 }, 5036 { ISD::AND, MVT::v4i1, 5 }, 5037 { ISD::AND, MVT::v8i1, 7 }, 5038 { ISD::AND, MVT::v16i1, 9 }, 5039 { ISD::AND, MVT::v32i1, 11 }, 5040 { ISD::AND, MVT::v64i1, 13 }, 5041 { ISD::OR, MVT::v2i1, 3 }, 5042 { ISD::OR, MVT::v4i1, 5 }, 5043 { ISD::OR, MVT::v8i1, 7 }, 5044 { ISD::OR, MVT::v16i1, 9 }, 5045 { ISD::OR, MVT::v32i1, 11 }, 5046 { ISD::OR, MVT::v64i1, 13 }, 5047 }; 5048 5049 static const CostTblEntry AVX2BoolReduction[] = { 5050 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp 5051 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp 5052 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp 5053 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp 5054 }; 5055 5056 static const CostTblEntry AVX1BoolReduction[] = { 5057 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp 5058 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp 5059 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp 5060 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp 5061 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp 5062 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp 5063 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp 5064 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp 5065 }; 5066 5067 static const CostTblEntry SSE2BoolReduction[] = { 5068 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp 5069 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp 5070 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp 5071 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp 5072 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp 5073 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp 5074 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp 5075 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp 5076 }; 5077 5078 // Handle bool allof/anyof patterns. 5079 if (ValVTy->getElementType()->isIntegerTy(1)) { 5080 InstructionCost ArithmeticCost = 0; 5081 if (LT.first != 1 && MTy.isVector() && 5082 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 5083 // Type needs to be split. We need LT.first - 1 arithmetic ops. 5084 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), 5085 MTy.getVectorNumElements()); 5086 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); 5087 ArithmeticCost *= LT.first - 1; 5088 } 5089 5090 if (ST->hasAVX512()) 5091 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) 5092 return ArithmeticCost + Entry->Cost; 5093 if (ST->hasAVX2()) 5094 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) 5095 return ArithmeticCost + Entry->Cost; 5096 if (ST->hasAVX()) 5097 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) 5098 return ArithmeticCost + Entry->Cost; 5099 if (ST->hasSSE2()) 5100 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) 5101 return ArithmeticCost + Entry->Cost; 5102 5103 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); 5104 } 5105 5106 unsigned NumVecElts = ValVTy->getNumElements(); 5107 unsigned ScalarSize = ValVTy->getScalarSizeInBits(); 5108 5109 // Special case power of 2 reductions where the scalar type isn't changed 5110 // by type legalization. 
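// (For example, a v8i32 add reduction qualifies for the special case below,
//  while a v3i32 one does not and is deferred to the base implementation.)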
5111 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) 5112 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); 5113 5114 InstructionCost ReductionCost = 0; 5115 5116 auto *Ty = ValVTy; 5117 if (LT.first != 1 && MTy.isVector() && 5118 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 5119 // Type needs to be split. We need LT.first - 1 arithmetic ops. 5120 Ty = FixedVectorType::get(ValVTy->getElementType(), 5121 MTy.getVectorNumElements()); 5122 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 5123 ReductionCost *= LT.first - 1; 5124 NumVecElts = MTy.getVectorNumElements(); 5125 } 5126 5127 // Now handle reduction with the legal type, taking into account size changes 5128 // at each level. 5129 while (NumVecElts > 1) { 5130 // Determine the size of the remaining vector we need to reduce. 5131 unsigned Size = NumVecElts * ScalarSize; 5132 NumVecElts /= 2; 5133 // If we're reducing from 256/512 bits, use an extract_subvector. 5134 if (Size > 128) { 5135 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); 5136 ReductionCost += 5137 getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind, 5138 NumVecElts, SubTy); 5139 Ty = SubTy; 5140 } else if (Size == 128) { 5141 // Reducing from 128 bits is a permute of v2f64/v2i64. 5142 FixedVectorType *ShufTy; 5143 if (ValVTy->isFloatingPointTy()) 5144 ShufTy = 5145 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); 5146 else 5147 ShufTy = 5148 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); 5149 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5150 std::nullopt, CostKind, 0, nullptr); 5151 } else if (Size == 64) { 5152 // Reducing from 64 bits is a shuffle of v4f32/v4i32. 5153 FixedVectorType *ShufTy; 5154 if (ValVTy->isFloatingPointTy()) 5155 ShufTy = 5156 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); 5157 else 5158 ShufTy = 5159 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); 5160 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5161 std::nullopt, CostKind, 0, nullptr); 5162 } else { 5163 // Reducing from smaller size is a shift by immediate. 5164 auto *ShiftTy = FixedVectorType::get( 5165 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); 5166 ReductionCost += getArithmeticInstrCost( 5167 Instruction::LShr, ShiftTy, CostKind, 5168 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 5169 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); 5170 } 5171 5172 // Add the arithmetic op for this level. 5173 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); 5174 } 5175 5176 // Add the final extract element to the cost. 5177 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 5178 CostKind, 0, nullptr, nullptr); 5179 } 5180 5181 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, 5182 bool IsUnsigned) { 5183 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 5184 5185 MVT MTy = LT.second; 5186 5187 int ISD; 5188 if (Ty->isIntOrIntVectorTy()) { 5189 ISD = IsUnsigned ? 
ISD::UMIN : ISD::SMIN; 5190 } else { 5191 assert(Ty->isFPOrFPVectorTy() && 5192 "Expected float point or integer vector type."); 5193 ISD = ISD::FMINNUM; 5194 } 5195 5196 static const CostTblEntry SSE1CostTbl[] = { 5197 {ISD::FMINNUM, MVT::v4f32, 1}, 5198 }; 5199 5200 static const CostTblEntry SSE2CostTbl[] = { 5201 {ISD::FMINNUM, MVT::v2f64, 1}, 5202 {ISD::SMIN, MVT::v8i16, 1}, 5203 {ISD::UMIN, MVT::v16i8, 1}, 5204 }; 5205 5206 static const CostTblEntry SSE41CostTbl[] = { 5207 {ISD::SMIN, MVT::v4i32, 1}, 5208 {ISD::UMIN, MVT::v4i32, 1}, 5209 {ISD::UMIN, MVT::v8i16, 1}, 5210 {ISD::SMIN, MVT::v16i8, 1}, 5211 }; 5212 5213 static const CostTblEntry SSE42CostTbl[] = { 5214 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd 5215 }; 5216 5217 static const CostTblEntry AVX1CostTbl[] = { 5218 {ISD::FMINNUM, MVT::v8f32, 1}, 5219 {ISD::FMINNUM, MVT::v4f64, 1}, 5220 {ISD::SMIN, MVT::v8i32, 3}, 5221 {ISD::UMIN, MVT::v8i32, 3}, 5222 {ISD::SMIN, MVT::v16i16, 3}, 5223 {ISD::UMIN, MVT::v16i16, 3}, 5224 {ISD::SMIN, MVT::v32i8, 3}, 5225 {ISD::UMIN, MVT::v32i8, 3}, 5226 }; 5227 5228 static const CostTblEntry AVX2CostTbl[] = { 5229 {ISD::SMIN, MVT::v8i32, 1}, 5230 {ISD::UMIN, MVT::v8i32, 1}, 5231 {ISD::SMIN, MVT::v16i16, 1}, 5232 {ISD::UMIN, MVT::v16i16, 1}, 5233 {ISD::SMIN, MVT::v32i8, 1}, 5234 {ISD::UMIN, MVT::v32i8, 1}, 5235 }; 5236 5237 static const CostTblEntry AVX512CostTbl[] = { 5238 {ISD::FMINNUM, MVT::v16f32, 1}, 5239 {ISD::FMINNUM, MVT::v8f64, 1}, 5240 {ISD::SMIN, MVT::v2i64, 1}, 5241 {ISD::UMIN, MVT::v2i64, 1}, 5242 {ISD::SMIN, MVT::v4i64, 1}, 5243 {ISD::UMIN, MVT::v4i64, 1}, 5244 {ISD::SMIN, MVT::v8i64, 1}, 5245 {ISD::UMIN, MVT::v8i64, 1}, 5246 {ISD::SMIN, MVT::v16i32, 1}, 5247 {ISD::UMIN, MVT::v16i32, 1}, 5248 }; 5249 5250 static const CostTblEntry AVX512BWCostTbl[] = { 5251 {ISD::SMIN, MVT::v32i16, 1}, 5252 {ISD::UMIN, MVT::v32i16, 1}, 5253 {ISD::SMIN, MVT::v64i8, 1}, 5254 {ISD::UMIN, MVT::v64i8, 1}, 5255 }; 5256 5257 // If we have a native MIN/MAX instruction for this type, use it. 5258 if (ST->hasBWI()) 5259 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 5260 return LT.first * Entry->Cost; 5261 5262 if (ST->hasAVX512()) 5263 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 5264 return LT.first * Entry->Cost; 5265 5266 if (ST->hasAVX2()) 5267 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 5268 return LT.first * Entry->Cost; 5269 5270 if (ST->hasAVX()) 5271 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 5272 return LT.first * Entry->Cost; 5273 5274 if (ST->hasSSE42()) 5275 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 5276 return LT.first * Entry->Cost; 5277 5278 if (ST->hasSSE41()) 5279 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 5280 return LT.first * Entry->Cost; 5281 5282 if (ST->hasSSE2()) 5283 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 5284 return LT.first * Entry->Cost; 5285 5286 if (ST->hasSSE1()) 5287 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 5288 return LT.first * Entry->Cost; 5289 5290 unsigned CmpOpcode; 5291 if (Ty->isFPOrFPVectorTy()) { 5292 CmpOpcode = Instruction::FCmp; 5293 } else { 5294 assert(Ty->isIntOrIntVectorTy() && 5295 "expecting floating point or integer type for min/max reduction"); 5296 CmpOpcode = Instruction::ICmp; 5297 } 5298 5299 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5300 // Otherwise fall back to cmp+select. 
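  // For example, a type with no native min/max at the available feature level
  // (such as a v2i64 smin without AVX512) is modelled as one vector compare
  // plus one vector select/blend.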
5301 InstructionCost Result = 5302 getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, 5303 CostKind) + 5304 getCmpSelInstrCost(Instruction::Select, Ty, CondTy, 5305 CmpInst::BAD_ICMP_PREDICATE, CostKind); 5306 return Result; 5307 } 5308 5309 InstructionCost 5310 X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, 5311 bool IsUnsigned, 5312 TTI::TargetCostKind CostKind) { 5313 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 5314 5315 MVT MTy = LT.second; 5316 5317 int ISD; 5318 if (ValTy->isIntOrIntVectorTy()) { 5319 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; 5320 } else { 5321 assert(ValTy->isFPOrFPVectorTy() && 5322 "Expected float point or integer vector type."); 5323 ISD = ISD::FMINNUM; 5324 } 5325 5326 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 5327 // and make it as the cost. 5328 5329 static const CostTblEntry SSE2CostTblNoPairWise[] = { 5330 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw 5331 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw 5332 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw 5333 }; 5334 5335 static const CostTblEntry SSE41CostTblNoPairWise[] = { 5336 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 5337 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 5338 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 5339 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 5340 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor 5341 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax 5342 {ISD::SMIN, MVT::v2i8, 3}, // pminsb 5343 {ISD::SMIN, MVT::v4i8, 5}, // pminsb 5344 {ISD::SMIN, MVT::v8i8, 7}, // pminsb 5345 {ISD::SMIN, MVT::v16i8, 6}, 5346 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 5347 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 5348 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 5349 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax 5350 }; 5351 5352 static const CostTblEntry AVX1CostTblNoPairWise[] = { 5353 {ISD::SMIN, MVT::v16i16, 6}, 5354 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax 5355 {ISD::SMIN, MVT::v32i8, 8}, 5356 {ISD::UMIN, MVT::v32i8, 8}, 5357 }; 5358 5359 static const CostTblEntry AVX512BWCostTblNoPairWise[] = { 5360 {ISD::SMIN, MVT::v32i16, 8}, 5361 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax 5362 {ISD::SMIN, MVT::v64i8, 10}, 5363 {ISD::UMIN, MVT::v64i8, 10}, 5364 }; 5365 5366 // Before legalizing the type, give a chance to look up illegal narrow types 5367 // in the table. 5368 // FIXME: Is there a better way to do this? 5369 EVT VT = TLI->getValueType(DL, ValTy); 5370 if (VT.isSimple()) { 5371 MVT MTy = VT.getSimpleVT(); 5372 if (ST->hasBWI()) 5373 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) 5374 return Entry->Cost; 5375 5376 if (ST->hasAVX()) 5377 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 5378 return Entry->Cost; 5379 5380 if (ST->hasSSE41()) 5381 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) 5382 return Entry->Cost; 5383 5384 if (ST->hasSSE2()) 5385 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 5386 return Entry->Cost; 5387 } 5388 5389 auto *ValVTy = cast<FixedVectorType>(ValTy); 5390 unsigned NumVecElts = ValVTy->getNumElements(); 5391 5392 auto *Ty = ValVTy; 5393 InstructionCost MinMaxCost = 0; 5394 if (LT.first != 1 && MTy.isVector() && 5395 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 5396 // Type needs to be split. 
We need LT.first - 1 operations ops. 5397 Ty = FixedVectorType::get(ValVTy->getElementType(), 5398 MTy.getVectorNumElements()); 5399 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), 5400 MTy.getVectorNumElements()); 5401 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); 5402 MinMaxCost *= LT.first - 1; 5403 NumVecElts = MTy.getVectorNumElements(); 5404 } 5405 5406 if (ST->hasBWI()) 5407 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) 5408 return MinMaxCost + Entry->Cost; 5409 5410 if (ST->hasAVX()) 5411 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 5412 return MinMaxCost + Entry->Cost; 5413 5414 if (ST->hasSSE41()) 5415 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) 5416 return MinMaxCost + Entry->Cost; 5417 5418 if (ST->hasSSE2()) 5419 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 5420 return MinMaxCost + Entry->Cost; 5421 5422 unsigned ScalarSize = ValTy->getScalarSizeInBits(); 5423 5424 // Special case power of 2 reductions where the scalar type isn't changed 5425 // by type legalization. 5426 if (!isPowerOf2_32(ValVTy->getNumElements()) || 5427 ScalarSize != MTy.getScalarSizeInBits()) 5428 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind); 5429 5430 // Now handle reduction with the legal type, taking into account size changes 5431 // at each level. 5432 while (NumVecElts > 1) { 5433 // Determine the size of the remaining vector we need to reduce. 5434 unsigned Size = NumVecElts * ScalarSize; 5435 NumVecElts /= 2; 5436 // If we're reducing from 256/512 bits, use an extract_subvector. 5437 if (Size > 128) { 5438 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); 5439 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 5440 CostKind, NumVecElts, SubTy); 5441 Ty = SubTy; 5442 } else if (Size == 128) { 5443 // Reducing from 128 bits is a permute of v2f64/v2i64. 5444 VectorType *ShufTy; 5445 if (ValTy->isFloatingPointTy()) 5446 ShufTy = 5447 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); 5448 else 5449 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); 5450 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5451 std::nullopt, CostKind, 0, nullptr); 5452 } else if (Size == 64) { 5453 // Reducing from 64 bits is a shuffle of v4f32/v4i32. 5454 FixedVectorType *ShufTy; 5455 if (ValTy->isFloatingPointTy()) 5456 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); 5457 else 5458 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); 5459 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 5460 std::nullopt, CostKind, 0, nullptr); 5461 } else { 5462 // Reducing from smaller size is a shift by immediate. 5463 auto *ShiftTy = FixedVectorType::get( 5464 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); 5465 MinMaxCost += getArithmeticInstrCost( 5466 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, 5467 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 5468 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); 5469 } 5470 5471 // Add the arithmetic op for this level. 5472 auto *SubCondTy = 5473 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); 5474 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); 5475 } 5476 5477 // Add the final extract element to the cost. 
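  // (Illustrative: a v4i32 smin reduction on SSE4.1 that reaches the loop
  // above is costed as two shuffles, two pminsd and one extractelement.)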
5478 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 5479 CostKind, 0, nullptr, nullptr); 5480 } 5481 5482 /// Calculate the cost of materializing a 64-bit value. This helper 5483 /// method might only calculate a fraction of a larger immediate. Therefore it 5484 /// is valid to return a cost of ZERO. 5485 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { 5486 if (Val == 0) 5487 return TTI::TCC_Free; 5488 5489 if (isInt<32>(Val)) 5490 return TTI::TCC_Basic; 5491 5492 return 2 * TTI::TCC_Basic; 5493 } 5494 5495 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 5496 TTI::TargetCostKind CostKind) { 5497 assert(Ty->isIntegerTy()); 5498 5499 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 5500 if (BitSize == 0) 5501 return ~0U; 5502 5503 // Never hoist constants larger than 128bit, because this might lead to 5504 // incorrect code generation or assertions in codegen. 5505 // Fixme: Create a cost model for types larger than i128 once the codegen 5506 // issues have been fixed. 5507 if (BitSize > 128) 5508 return TTI::TCC_Free; 5509 5510 if (Imm == 0) 5511 return TTI::TCC_Free; 5512 5513 // Sign-extend all constants to a multiple of 64-bit. 5514 APInt ImmVal = Imm; 5515 if (BitSize % 64 != 0) 5516 ImmVal = Imm.sext(alignTo(BitSize, 64)); 5517 5518 // Split the constant into 64-bit chunks and calculate the cost for each 5519 // chunk. 5520 InstructionCost Cost = 0; 5521 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 5522 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 5523 int64_t Val = Tmp.getSExtValue(); 5524 Cost += getIntImmCost(Val); 5525 } 5526 // We need at least one instruction to materialize the constant. 5527 return std::max<InstructionCost>(1, Cost); 5528 } 5529 5530 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 5531 const APInt &Imm, Type *Ty, 5532 TTI::TargetCostKind CostKind, 5533 Instruction *Inst) { 5534 assert(Ty->isIntegerTy()); 5535 5536 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 5537 // There is no cost model for constants with a bit size of 0. Return TCC_Free 5538 // here, so that constant hoisting will ignore this constant. 5539 if (BitSize == 0) 5540 return TTI::TCC_Free; 5541 5542 unsigned ImmIdx = ~0U; 5543 switch (Opcode) { 5544 default: 5545 return TTI::TCC_Free; 5546 case Instruction::GetElementPtr: 5547 // Always hoist the base address of a GetElementPtr. This prevents the 5548 // creation of new constants for every base constant that gets constant 5549 // folded with the offset. 5550 if (Idx == 0) 5551 return 2 * TTI::TCC_Basic; 5552 return TTI::TCC_Free; 5553 case Instruction::Store: 5554 ImmIdx = 0; 5555 break; 5556 case Instruction::ICmp: 5557 // This is an imperfect hack to prevent constant hoisting of 5558 // compares that might be trying to check if a 64-bit value fits in 5559 // 32-bits. The backend can optimize these cases using a right shift by 32. 5560 // Ideally we would check the compare predicate here. There also other 5561 // similar immediates the backend can use shifts for. 5562 if (Idx == 1 && Imm.getBitWidth() == 64) { 5563 uint64_t ImmVal = Imm.getZExtValue(); 5564 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) 5565 return TTI::TCC_Free; 5566 } 5567 ImmIdx = 1; 5568 break; 5569 case Instruction::And: 5570 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes 5571 // by using a 32-bit operation with implicit zero extension. Detect such 5572 // immediates here as the normal path expects bit 31 to be sign extended. 
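    // For example, 'and i64 %x, 0xfffffff0' can be encoded as a 32-bit AND,
    // which implicitly zeroes the upper 32 bits, so the immediate needs no
    // extra materialization.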
5573 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32)) 5574 return TTI::TCC_Free; 5575 ImmIdx = 1; 5576 break; 5577 case Instruction::Add: 5578 case Instruction::Sub: 5579 // For add/sub, we can use the opposite instruction for INT32_MIN. 5580 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) 5581 return TTI::TCC_Free; 5582 ImmIdx = 1; 5583 break; 5584 case Instruction::UDiv: 5585 case Instruction::SDiv: 5586 case Instruction::URem: 5587 case Instruction::SRem: 5588 // Division by constant is typically expanded later into a different 5589 // instruction sequence. This completely changes the constants. 5590 // Report them as "free" to stop ConstantHoist from marking them as opaque. 5591 return TTI::TCC_Free; 5592 case Instruction::Mul: 5593 case Instruction::Or: 5594 case Instruction::Xor: 5595 ImmIdx = 1; 5596 break; 5597 // Always return TCC_Free for the shift value of a shift instruction. 5598 case Instruction::Shl: 5599 case Instruction::LShr: 5600 case Instruction::AShr: 5601 if (Idx == 1) 5602 return TTI::TCC_Free; 5603 break; 5604 case Instruction::Trunc: 5605 case Instruction::ZExt: 5606 case Instruction::SExt: 5607 case Instruction::IntToPtr: 5608 case Instruction::PtrToInt: 5609 case Instruction::BitCast: 5610 case Instruction::PHI: 5611 case Instruction::Call: 5612 case Instruction::Select: 5613 case Instruction::Ret: 5614 case Instruction::Load: 5615 break; 5616 } 5617 5618 if (Idx == ImmIdx) { 5619 int NumConstants = divideCeil(BitSize, 64); 5620 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 5621 return (Cost <= NumConstants * TTI::TCC_Basic) 5622 ? static_cast<int>(TTI::TCC_Free) 5623 : Cost; 5624 } 5625 5626 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 5627 } 5628 5629 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 5630 const APInt &Imm, Type *Ty, 5631 TTI::TargetCostKind CostKind) { 5632 assert(Ty->isIntegerTy()); 5633 5634 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 5635 // There is no cost model for constants with a bit size of 0. Return TCC_Free 5636 // here, so that constant hoisting will ignore this constant. 5637 if (BitSize == 0) 5638 return TTI::TCC_Free; 5639 5640 switch (IID) { 5641 default: 5642 return TTI::TCC_Free; 5643 case Intrinsic::sadd_with_overflow: 5644 case Intrinsic::uadd_with_overflow: 5645 case Intrinsic::ssub_with_overflow: 5646 case Intrinsic::usub_with_overflow: 5647 case Intrinsic::smul_with_overflow: 5648 case Intrinsic::umul_with_overflow: 5649 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32)) 5650 return TTI::TCC_Free; 5651 break; 5652 case Intrinsic::experimental_stackmap: 5653 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) 5654 return TTI::TCC_Free; 5655 break; 5656 case Intrinsic::experimental_patchpoint_void: 5657 case Intrinsic::experimental_patchpoint_i64: 5658 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) 5659 return TTI::TCC_Free; 5660 break; 5661 } 5662 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 5663 } 5664 5665 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, 5666 TTI::TargetCostKind CostKind, 5667 const Instruction *I) { 5668 if (CostKind != TTI::TCK_RecipThroughput) 5669 return Opcode == Instruction::PHI ? 0 : 1; 5670 // Branches are assumed to be predicted. 5671 return 0; 5672 } 5673 5674 int X86TTIImpl::getGatherOverhead() const { 5675 // Some CPUs have more overhead for gather. The specified overhead is relative 5676 // to the Load operation. 
"2" is the number provided by Intel architects. This 5677 // parameter is used for cost estimation of Gather Op and comparison with 5678 // other alternatives. 5679 // TODO: Remove the explicit hasAVX512()?, That would mean we would only 5680 // enable gather with a -march. 5681 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) 5682 return 2; 5683 5684 return 1024; 5685 } 5686 5687 int X86TTIImpl::getScatterOverhead() const { 5688 if (ST->hasAVX512()) 5689 return 2; 5690 5691 return 1024; 5692 } 5693 5694 // Return an average cost of Gather / Scatter instruction, maybe improved later. 5695 // FIXME: Add TargetCostKind support. 5696 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, 5697 const Value *Ptr, Align Alignment, 5698 unsigned AddressSpace) { 5699 5700 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); 5701 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); 5702 5703 // Try to reduce index size from 64 bit (default for GEP) 5704 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the 5705 // operation will use 16 x 64 indices which do not fit in a zmm and needs 5706 // to split. Also check that the base pointer is the same for all lanes, 5707 // and that there's at most one variable index. 5708 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { 5709 unsigned IndexSize = DL.getPointerSizeInBits(); 5710 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); 5711 if (IndexSize < 64 || !GEP) 5712 return IndexSize; 5713 5714 unsigned NumOfVarIndices = 0; 5715 const Value *Ptrs = GEP->getPointerOperand(); 5716 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) 5717 return IndexSize; 5718 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { 5719 if (isa<Constant>(GEP->getOperand(i))) 5720 continue; 5721 Type *IndxTy = GEP->getOperand(i)->getType(); 5722 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) 5723 IndxTy = IndexVTy->getElementType(); 5724 if ((IndxTy->getPrimitiveSizeInBits() == 64 && 5725 !isa<SExtInst>(GEP->getOperand(i))) || 5726 ++NumOfVarIndices > 1) 5727 return IndexSize; // 64 5728 } 5729 return (unsigned)32; 5730 }; 5731 5732 // Trying to reduce IndexSize to 32 bits for vector 16. 5733 // By default the IndexSize is equal to pointer size. 5734 unsigned IndexSize = (ST->hasAVX512() && VF >= 16) 5735 ? getIndexSizeInBits(Ptr, DL) 5736 : DL.getPointerSizeInBits(); 5737 5738 auto *IndexVTy = FixedVectorType::get( 5739 IntegerType::get(SrcVTy->getContext(), IndexSize), VF); 5740 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy); 5741 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy); 5742 InstructionCost::CostType SplitFactor = 5743 *std::max(IdxsLT.first, SrcLT.first).getValue(); 5744 if (SplitFactor > 1) { 5745 // Handle splitting of vector of pointers 5746 auto *SplitSrcTy = 5747 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); 5748 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, 5749 AddressSpace); 5750 } 5751 5752 // The gather / scatter cost is given by Intel architects. It is a rough 5753 // number since we are looking at one instruction in a time. 5754 const int GSOverhead = (Opcode == Instruction::Load) 5755 ? 
getGatherOverhead() 5756 : getScatterOverhead(); 5757 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 5758 MaybeAlign(Alignment), AddressSpace, 5759 TTI::TCK_RecipThroughput); 5760 } 5761 5762 /// Return the cost of full scalarization of gather / scatter operation. 5763 /// 5764 /// Opcode - Load or Store instruction. 5765 /// SrcVTy - The type of the data vector that should be gathered or scattered. 5766 /// VariableMask - The mask is non-constant at compile time. 5767 /// Alignment - Alignment for one element. 5768 /// AddressSpace - pointer[s] address space. 5769 /// 5770 /// FIXME: Add TargetCostKind support. 5771 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, 5772 bool VariableMask, Align Alignment, 5773 unsigned AddressSpace) { 5774 Type *ScalarTy = SrcVTy->getScalarType(); 5775 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); 5776 APInt DemandedElts = APInt::getAllOnes(VF); 5777 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5778 5779 InstructionCost MaskUnpackCost = 0; 5780 if (VariableMask) { 5781 auto *MaskTy = 5782 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); 5783 MaskUnpackCost = getScalarizationOverhead( 5784 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); 5785 InstructionCost ScalarCompareCost = getCmpSelInstrCost( 5786 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, 5787 CmpInst::BAD_ICMP_PREDICATE, CostKind); 5788 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); 5789 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); 5790 } 5791 5792 InstructionCost AddressUnpackCost = getScalarizationOverhead( 5793 FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts, 5794 /*Insert=*/false, /*Extract=*/true, CostKind); 5795 5796 // The cost of the scalar loads/stores. 5797 InstructionCost MemoryOpCost = 5798 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment), 5799 AddressSpace, CostKind); 5800 5801 // The cost of forming the vector from loaded scalars/ 5802 // scalarizing the vector to perform scalar stores. 
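  // For a gather this is the cost of inserting the VF loaded scalars into the
  // result vector; for a scatter it is the cost of extracting the VF scalars
  // that will be stored. Roughly, the total returned below is mask unpacking
  // (for a variable mask) + address extraction + VF scalar memory ops + this
  // insert/extract cost.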
5803 InstructionCost InsertExtractCost = getScalarizationOverhead( 5804 cast<FixedVectorType>(SrcVTy), DemandedElts, 5805 /*Insert=*/Opcode == Instruction::Load, 5806 /*Extract=*/Opcode == Instruction::Store, CostKind); 5807 5808 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost; 5809 } 5810 5811 /// Calculate the cost of Gather / Scatter operation 5812 InstructionCost X86TTIImpl::getGatherScatterOpCost( 5813 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, 5814 Align Alignment, TTI::TargetCostKind CostKind, 5815 const Instruction *I = nullptr) { 5816 if (CostKind != TTI::TCK_RecipThroughput) { 5817 if ((Opcode == Instruction::Load && 5818 isLegalMaskedGather(SrcVTy, Align(Alignment)) && 5819 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), 5820 Align(Alignment))) || 5821 (Opcode == Instruction::Store && 5822 isLegalMaskedScatter(SrcVTy, Align(Alignment)) && 5823 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), 5824 Align(Alignment)))) 5825 return 1; 5826 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, 5827 Alignment, CostKind, I); 5828 } 5829 5830 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); 5831 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 5832 if (!PtrTy && Ptr->getType()->isVectorTy()) 5833 PtrTy = dyn_cast<PointerType>( 5834 cast<VectorType>(Ptr->getType())->getElementType()); 5835 assert(PtrTy && "Unexpected type for Ptr argument"); 5836 unsigned AddressSpace = PtrTy->getAddressSpace(); 5837 5838 if ((Opcode == Instruction::Load && 5839 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || 5840 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), 5841 Align(Alignment)))) || 5842 (Opcode == Instruction::Store && 5843 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || 5844 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), 5845 Align(Alignment))))) 5846 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, 5847 AddressSpace); 5848 5849 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); 5850 } 5851 5852 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, 5853 const TargetTransformInfo::LSRCost &C2) { 5854 // X86 specific here are "instruction number 1st priority". 5855 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, 5856 C1.NumIVMuls, C1.NumBaseAdds, 5857 C1.ScaleCost, C1.ImmCost, C1.SetupCost) < 5858 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, 5859 C2.NumIVMuls, C2.NumBaseAdds, 5860 C2.ScaleCost, C2.ImmCost, C2.SetupCost); 5861 } 5862 5863 bool X86TTIImpl::canMacroFuseCmp() { 5864 return ST->hasMacroFusion() || ST->hasBranchFusion(); 5865 } 5866 5867 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 5868 if (!ST->hasAVX()) 5869 return false; 5870 5871 // The backend can't handle a single element vector. 
  if (isa<VectorType>(DataTy) &&
      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return false;
  Type *ScalarTy = DataTy->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (ScalarTy->isHalfTy() && ST->hasBWI())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64 ||
         ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}

bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
  return isLegalMaskedLoad(DataType, Alignment);
}

bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
  unsigned DataSize = DL.getTypeStoreSize(DataType);
  // The only supported nontemporal loads are for aligned vectors of 16 or 32
  // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
  // (the equivalent stores only require AVX).
  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
    return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();

  return false;
}

bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
  unsigned DataSize = DL.getTypeStoreSize(DataType);

  // SSE4A supports nontemporal stores of float and double at arbitrary
  // alignment.
  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
    return true;

  // Besides the SSE4A subtarget exception above, only aligned stores are
  // available nontemporally on any other subtarget. And only stores with a
  // size of 4..32 bytes (powers of 2 only) are permitted.
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
      !isPowerOf2_32(DataSize))
    return false;

  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
  // loads require AVX2).
  if (DataSize == 32)
    return ST->hasAVX();
  if (DataSize == 16)
    return ST->hasSSE1();
  return true;
}

bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
                                      ElementCount NumElements) const {
  // movddup
  return ST->hasSSE3() && !NumElements.isScalable() &&
         NumElements.getFixedValue() == 2 &&
         ElementTy == Type::getDoubleTy(ElementTy->getContext());
}

bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
  if (!isa<VectorType>(DataTy))
    return false;

  if (!ST->hasAVX512())
    return false;

  // The backend can't handle a single element vector.
  if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return false;

  Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64 ||
         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
}

bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
  return isLegalMaskedExpandLoad(DataTy);
}

bool X86TTIImpl::supportsGather() const {
  // Some CPUs have better gather performance than others.
  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
  // enable gather with a -march.
5973 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); 5974 } 5975 5976 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { 5977 // Gather / Scatter for vector 2 is not profitable on KNL / SKX 5978 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend 5979 // it to 8 elements, but zeroing upper bits of the mask vector will add more 5980 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: 5981 // Check, maybe the gather/scatter instruction is better in the VariableMask 5982 // case. 5983 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); 5984 return NumElts == 1 || 5985 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); 5986 } 5987 5988 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { 5989 if (!supportsGather()) 5990 return false; 5991 Type *ScalarTy = DataTy->getScalarType(); 5992 if (ScalarTy->isPointerTy()) 5993 return true; 5994 5995 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) 5996 return true; 5997 5998 if (!ScalarTy->isIntegerTy()) 5999 return false; 6000 6001 unsigned IntWidth = ScalarTy->getIntegerBitWidth(); 6002 return IntWidth == 32 || IntWidth == 64; 6003 } 6004 6005 bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, 6006 unsigned Opcode1, 6007 const SmallBitVector &OpcodeMask) const { 6008 // ADDSUBPS 4xf32 SSE3 6009 // VADDSUBPS 4xf32 AVX 6010 // VADDSUBPS 8xf32 AVX2 6011 // ADDSUBPD 2xf64 SSE3 6012 // VADDSUBPD 2xf64 AVX 6013 // VADDSUBPD 4xf64 AVX2 6014 6015 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); 6016 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible"); 6017 if (!isPowerOf2_32(NumElements)) 6018 return false; 6019 // Check the opcode pattern. We apply the mask on the opcode arguments and 6020 // then check if it is what we expect. 6021 for (int Lane : seq<int>(0, NumElements)) { 6022 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0; 6023 // We expect FSub for even lanes and FAdd for odd lanes. 6024 if (Lane % 2 == 0 && Opc != Instruction::FSub) 6025 return false; 6026 if (Lane % 2 == 1 && Opc != Instruction::FAdd) 6027 return false; 6028 } 6029 // Now check that the pattern is supported by the target ISA. 6030 Type *ElemTy = cast<VectorType>(VecTy)->getElementType(); 6031 if (ElemTy->isFloatTy()) 6032 return ST->hasSSE3() && NumElements % 4 == 0; 6033 if (ElemTy->isDoubleTy()) 6034 return ST->hasSSE3() && NumElements % 2 == 0; 6035 return false; 6036 } 6037 6038 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { 6039 // AVX2 doesn't support scatter 6040 if (!ST->hasAVX512()) 6041 return false; 6042 return isLegalMaskedGather(DataType, Alignment); 6043 } 6044 6045 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { 6046 EVT VT = TLI->getValueType(DL, DataType); 6047 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); 6048 } 6049 6050 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) { 6051 // FDIV is always expensive, even if it has a very low uop count. 6052 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? 
6053 if (I->getOpcode() == Instruction::FDiv) 6054 return true; 6055 6056 return BaseT::isExpensiveToSpeculativelyExecute(I); 6057 } 6058 6059 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 6060 return false; 6061 } 6062 6063 bool X86TTIImpl::areInlineCompatible(const Function *Caller, 6064 const Function *Callee) const { 6065 const TargetMachine &TM = getTLI()->getTargetMachine(); 6066 6067 // Work this as a subsetting of subtarget features. 6068 const FeatureBitset &CallerBits = 6069 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 6070 const FeatureBitset &CalleeBits = 6071 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 6072 6073 // Check whether features are the same (apart from the ignore list). 6074 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; 6075 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; 6076 if (RealCallerBits == RealCalleeBits) 6077 return true; 6078 6079 // If the features are a subset, we need to additionally check for calls 6080 // that may become ABI-incompatible as a result of inlining. 6081 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) 6082 return false; 6083 6084 for (const Instruction &I : instructions(Callee)) { 6085 if (const auto *CB = dyn_cast<CallBase>(&I)) { 6086 SmallVector<Type *, 8> Types; 6087 for (Value *Arg : CB->args()) 6088 Types.push_back(Arg->getType()); 6089 if (!CB->getType()->isVoidTy()) 6090 Types.push_back(CB->getType()); 6091 6092 // Simple types are always ABI compatible. 6093 auto IsSimpleTy = [](Type *Ty) { 6094 return !Ty->isVectorTy() && !Ty->isAggregateType(); 6095 }; 6096 if (all_of(Types, IsSimpleTy)) 6097 continue; 6098 6099 if (Function *NestedCallee = CB->getCalledFunction()) { 6100 // Assume that intrinsics are always ABI compatible. 6101 if (NestedCallee->isIntrinsic()) 6102 continue; 6103 6104 // Do a precise compatibility check. 6105 if (!areTypesABICompatible(Caller, NestedCallee, Types)) 6106 return false; 6107 } else { 6108 // We don't know the target features of the callee, 6109 // assume it is incompatible. 6110 return false; 6111 } 6112 } 6113 } 6114 return true; 6115 } 6116 6117 bool X86TTIImpl::areTypesABICompatible(const Function *Caller, 6118 const Function *Callee, 6119 const ArrayRef<Type *> &Types) const { 6120 if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) 6121 return false; 6122 6123 // If we get here, we know the target features match. If one function 6124 // considers 512-bit vectors legal and the other does not, consider them 6125 // incompatible. 6126 const TargetMachine &TM = getTLI()->getTargetMachine(); 6127 6128 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == 6129 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) 6130 return true; 6131 6132 // Consider the arguments compatible if they aren't vectors or aggregates. 6133 // FIXME: Look at the size of vectors. 6134 // FIXME: Look at the element types of aggregates to see if there are vectors. 6135 return llvm::none_of(Types, 6136 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); 6137 } 6138 6139 X86TTIImpl::TTI::MemCmpExpansionOptions 6140 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 6141 TTI::MemCmpExpansionOptions Options; 6142 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 6143 Options.NumLoadsPerBlock = 2; 6144 // All GPR and vector loads can be unaligned. 6145 Options.AllowOverlappingLoads = true; 6146 if (IsZeroCmp) { 6147 // Only enable vector loads for equality comparison. 
Right now the vector 6148 // version is not as fast for three way compare (see #33329). 6149 const unsigned PreferredWidth = ST->getPreferVectorWidth(); 6150 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); 6151 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); 6152 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); 6153 } 6154 if (ST->is64Bit()) { 6155 Options.LoadSizes.push_back(8); 6156 } 6157 Options.LoadSizes.push_back(4); 6158 Options.LoadSizes.push_back(2); 6159 Options.LoadSizes.push_back(1); 6160 return Options; 6161 } 6162 6163 bool X86TTIImpl::prefersVectorizedAddressing() const { 6164 return supportsGather(); 6165 } 6166 6167 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { 6168 return false; 6169 } 6170 6171 bool X86TTIImpl::enableInterleavedAccessVectorization() { 6172 // TODO: We expect this to be beneficial regardless of arch, 6173 // but there are currently some unexplained performance artifacts on Atom. 6174 // As a temporary solution, disable on Atom. 6175 return !(ST->isAtom()); 6176 } 6177 6178 // Get estimation for interleaved load/store operations and strided load. 6179 // \p Indices contains indices for strided load. 6180 // \p Factor - the factor of interleaving. 6181 // AVX-512 provides 3-src shuffles that significantly reduces the cost. 6182 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( 6183 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, 6184 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, 6185 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { 6186 // VecTy for interleave memop is <VF*Factor x Elt>. 6187 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have 6188 // VecTy = <12 x i32>. 6189 6190 // Calculate the number of memory operations (NumOfMemOps), required 6191 // for load/store the VecTy. 6192 MVT LegalVT = getTypeLegalizationCost(VecTy).second; 6193 unsigned VecTySize = DL.getTypeStoreSize(VecTy); 6194 unsigned LegalVTSize = LegalVT.getStoreSize(); 6195 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; 6196 6197 // Get the cost of one memory operation. 6198 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), 6199 LegalVT.getVectorNumElements()); 6200 InstructionCost MemOpCost; 6201 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; 6202 if (UseMaskedMemOp) 6203 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, 6204 AddressSpace, CostKind); 6205 else 6206 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment), 6207 AddressSpace, CostKind); 6208 6209 unsigned VF = VecTy->getNumElements() / Factor; 6210 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); 6211 6212 InstructionCost MaskCost; 6213 if (UseMaskedMemOp) { 6214 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); 6215 for (unsigned Index : Indices) { 6216 assert(Index < Factor && "Invalid index for interleaved memory op"); 6217 for (unsigned Elm = 0; Elm < VF; Elm++) 6218 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 6219 } 6220 6221 Type *I1Type = Type::getInt1Ty(VecTy->getContext()); 6222 6223 MaskCost = getReplicationShuffleCost( 6224 I1Type, Factor, VF, 6225 UseMaskForGaps ? DemandedLoadStoreElts 6226 : APInt::getAllOnes(VecTy->getNumElements()), 6227 CostKind); 6228 6229 // The Gaps mask is invariant and created outside the loop, therefore the 6230 // cost of creating it is not accounted for here. 
    // However, if we have both a MaskForGaps and some other mask that guards
    // the execution of the memory access, we need to account for the cost of
    // And-ing the two masks inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of the loads and stores is computed separately from the table.

    // X86InterleavedAccess supports only the following interleaved-access
    // groups.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fall back to the default implementation.

    // The kind of shuffle depends on the number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About half of the loads may be folded into shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get the number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
    // When we have more than one destination, we need additional instructions
    // to keep the sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess supports only the following interleaved-access
  // groups.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fall back to the default implementation.

  // There are no strided stores at the moment, and a store can't be folded
  // into a shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost = getShuffleCost(
      TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
  // We need additional instructions to keep the sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}

InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  auto *VecTy = cast<FixedVectorType>(BaseTy);

  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
      return HasBW;
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
    return getInterleavedMemoryOpCostAVX512(
        Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind,
        UseMaskForCond, UseMaskForGaps);

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  // Get estimation for interleaved load/store operations for SSE-AVX2.
  // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
  // computing the cost using a generic formula as a function of generic
  // shuffles. We therefore use a lookup table instead, filled according to
  // the instruction sequences that codegen currently generates.

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;

  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  // VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
6374 if (!LegalVT.isVector()) 6375 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 6376 Alignment, AddressSpace, CostKind); 6377 6378 unsigned VF = VecTy->getNumElements() / Factor; 6379 Type *ScalarTy = VecTy->getElementType(); 6380 // Deduplicate entries, model floats/pointers as appropriately-sized integers. 6381 if (!ScalarTy->isIntegerTy()) 6382 ScalarTy = 6383 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); 6384 6385 // Get the cost of all the memory operations. 6386 // FIXME: discount dead loads. 6387 InstructionCost MemOpCosts = getMemoryOpCost( 6388 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); 6389 6390 auto *VT = FixedVectorType::get(ScalarTy, VF); 6391 EVT ETy = TLI->getValueType(DL, VT); 6392 if (!ETy.isSimple()) 6393 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 6394 Alignment, AddressSpace, CostKind); 6395 6396 // TODO: Complete for other data-types and strides. 6397 // Each combination of Stride, element bit width and VF results in a different 6398 // sequence; The cost tables are therefore accessed with: 6399 // Factor (stride) and VectorType=VFxiN. 6400 // The Cost accounts only for the shuffle sequence; 6401 // The cost of the loads/stores is accounted for separately. 6402 // 6403 static const CostTblEntry AVX2InterleavedLoadTbl[] = { 6404 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 6405 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 6406 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 6407 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 6408 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 6409 6410 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 6411 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 6412 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 6413 6414 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 6415 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 6416 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 6417 6418 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 6419 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 6420 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 6421 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 6422 6423 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 6424 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 6425 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 6426 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 6427 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 6428 6429 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 6430 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 6431 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 6432 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 6433 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 6434 6435 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 6436 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 6437 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 6438 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 6439 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 
32i32 6440 6441 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64 6442 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64 6443 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64 6444 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64 6445 6446 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8 6447 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 6448 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8 6449 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8 6450 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8 6451 6452 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16 6453 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16 6454 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16 6455 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16 6456 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16 6457 6458 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32 6459 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32 6460 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32 6461 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32 6462 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32 6463 6464 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64 6465 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64 6466 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64 6467 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64 6468 6469 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8 6470 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8 6471 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8 6472 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8 6473 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8 6474 6475 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16 6476 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16 6477 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16 6478 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16 6479 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16 6480 6481 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32 6482 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32 6483 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32 6484 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32 6485 6486 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64 6487 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64 6488 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64 6489 6490 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 6491 }; 6492 6493 static const CostTblEntry SSSE3InterleavedLoadTbl[] = { 6494 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16 6495 }; 6496 6497 static const CostTblEntry SSE2InterleavedLoadTbl[] = { 6498 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16 6499 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16 6500 6501 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32 6502 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32 6503 6504 {2, 
MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64 6505 }; 6506 6507 static const CostTblEntry AVX2InterleavedStoreTbl[] = { 6508 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store) 6509 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store) 6510 6511 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store) 6512 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store) 6513 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store) 6514 6515 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store) 6516 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store) 6517 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store) 6518 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store) 6519 6520 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store) 6521 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store) 6522 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store) 6523 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store) 6524 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store) 6525 6526 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store) 6527 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store) 6528 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store) 6529 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) 6530 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) 6531 6532 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store) 6533 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store) 6534 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store) 6535 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store) 6536 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store) 6537 6538 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store) 6539 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store) 6540 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store) 6541 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store) 6542 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store) 6543 6544 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store) 6545 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store) 6546 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store) 6547 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store) 6548 6549 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store) 6550 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store) 6551 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store) 6552 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store) 6553 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store) 6554 6555 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store) 6556 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store) 6557 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store) 6558 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store) 6559 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store) 6560 6561 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store) 6562 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store) 6563 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store) 6564 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store) 6565 {4, 
      {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
      {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
  };

  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };

  if (Opcode == Instruction::Load) {
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over/under-estimate the cost!
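      // Illustrative example (hypothetical values, not taken from the tables
      // above): with Factor == 4, NumMembers == 2 requested members, and an
      // Entry->Cost of 8 shuffles for the full group, the discount charges
      // divideCeil(2 * 8, 4) == 4 on top of MemOpCosts.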
      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
    };

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    assert((!Indices.size() || Indices.size() == Factor) &&
           "Interleaved store only supports fully-interleaved groups.");
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e., inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just 1 allocation, i.e., it frees allocations for other
  // operations and has fewer micro operations to execute.
  //
  // For some X86 architectures this is even worse because, for instance, for
  // stores the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}