xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp (revision 258a0d760aa8b42899a000e30f610f900a402556)
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// About Cost Model numbers used below it's necessary to say the following:
16 /// the numbers correspond to some "generic" X86 CPU instead of usage of a
17 /// specific CPU model. Usually the numbers correspond to the CPU where the
18 /// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19 /// the lookups below the cost is based on Nehalem as that was the first CPU
20 /// to support that feature level and thus has most likely the worst case cost,
21 /// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22 ///
23 /// Some examples of other technologies/CPUs:
24 ///   SSE 3   - Pentium4 / Athlon64
25 ///   SSE 4.1 - Penryn
26 ///   SSE 4.2 - Nehalem / Silvermont
27 ///   AVX     - Sandy Bridge / Jaguar / Bulldozer
28 ///   AVX2    - Haswell / Ryzen
29 ///   AVX-512 - Xeon Phi / Skylake
30 ///
31 /// And some examples of instruction target dependent costs (latency)
32 ///                   divss     sqrtss          rsqrtss
33 ///   AMD K7          11-16     19              3
34 ///   Piledriver      9-24      13-15           5
35 ///   Jaguar          14        16              2
36 ///   Pentium II,III  18        30              2
37 ///   Nehalem         7-14      7-18            3
38 ///   Haswell         10-13     11              5
39 ///
40 /// Interpreting the 4 TargetCostKind types:
41 /// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42 /// values reported by the CPU scheduler models (and llvm-mca).
43 /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44 /// actual encoding size of the instruction.
45 /// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
47 /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48 /// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49 //===----------------------------------------------------------------------===//
50 
51 #include "X86TargetTransformInfo.h"
52 #include "llvm/Analysis/TargetTransformInfo.h"
53 #include "llvm/CodeGen/BasicTTIImpl.h"
54 #include "llvm/CodeGen/CostTable.h"
55 #include "llvm/CodeGen/TargetLowering.h"
56 #include "llvm/IR/InstIterator.h"
57 #include "llvm/IR/IntrinsicInst.h"
58 #include "llvm/Support/Debug.h"
59 #include <optional>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "x86tti"
64 
65 //===----------------------------------------------------------------------===//
66 //
67 // X86 cost model.
68 //
69 //===----------------------------------------------------------------------===//
70 
71 // Helper struct to store/access costs for each cost kind.
72 // TODO: Move this to allow other targets to use it?
73 struct CostKindCosts {
74   unsigned RecipThroughputCost = ~0U;
75   unsigned LatencyCost = ~0U;
76   unsigned CodeSizeCost = ~0U;
77   unsigned SizeAndLatencyCost = ~0U;
78 
79   std::optional<unsigned>
80   operator[](TargetTransformInfo::TargetCostKind Kind) const {
81     unsigned Cost = ~0U;
82     switch (Kind) {
83     case TargetTransformInfo::TCK_RecipThroughput:
84       Cost = RecipThroughputCost;
85       break;
86     case TargetTransformInfo::TCK_Latency:
87       Cost = LatencyCost;
88       break;
89     case TargetTransformInfo::TCK_CodeSize:
90       Cost = CodeSizeCost;
91       break;
92     case TargetTransformInfo::TCK_SizeAndLatency:
93       Cost = SizeAndLatencyCost;
94       break;
95     }
96     if (Cost == ~0U)
97       return std::nullopt;
98     return Cost;
99   }
100 };
101 using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
102 
103 TargetTransformInfo::PopcntSupportKind
104 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106   // TODO: Currently the __builtin_popcount() implementation using SSE3
107   //   instructions is inefficient. Once the problem is fixed, we should
108   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
109   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110 }
111 
112 std::optional<unsigned> X86TTIImpl::getCacheSize(
113   TargetTransformInfo::CacheLevel Level) const {
114   switch (Level) {
115   case TargetTransformInfo::CacheLevel::L1D:
116     //   - Penryn
117     //   - Nehalem
118     //   - Westmere
119     //   - Sandy Bridge
120     //   - Ivy Bridge
121     //   - Haswell
122     //   - Broadwell
123     //   - Skylake
124     //   - Kabylake
125     return 32 * 1024;  //  32 KByte
126   case TargetTransformInfo::CacheLevel::L2D:
127     //   - Penryn
128     //   - Nehalem
129     //   - Westmere
130     //   - Sandy Bridge
131     //   - Ivy Bridge
132     //   - Haswell
133     //   - Broadwell
134     //   - Skylake
135     //   - Kabylake
136     return 256 * 1024; // 256 KByte
137   }
138 
139   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140 }
141 
142 std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143   TargetTransformInfo::CacheLevel Level) const {
144   //   - Penryn
145   //   - Nehalem
146   //   - Westmere
147   //   - Sandy Bridge
148   //   - Ivy Bridge
149   //   - Haswell
150   //   - Broadwell
151   //   - Skylake
152   //   - Kabylake
153   switch (Level) {
154   case TargetTransformInfo::CacheLevel::L1D:
155     [[fallthrough]];
156   case TargetTransformInfo::CacheLevel::L2D:
157     return 8;
158   }
159 
160   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161 }
162 
163 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164   bool Vector = (ClassID == 1);
165   if (Vector && !ST->hasSSE1())
166     return 0;
167 
168   if (ST->is64Bit()) {
169     if (Vector && ST->hasAVX512())
170       return 32;
171     return 16;
172   }
173   return 8;
174 }
175 
176 TypeSize
177 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179   switch (K) {
180   case TargetTransformInfo::RGK_Scalar:
181     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182   case TargetTransformInfo::RGK_FixedWidthVector:
183     if (ST->hasAVX512() && PreferVectorWidth >= 512)
184       return TypeSize::getFixed(512);
185     if (ST->hasAVX() && PreferVectorWidth >= 256)
186       return TypeSize::getFixed(256);
187     if (ST->hasSSE1() && PreferVectorWidth >= 128)
188       return TypeSize::getFixed(128);
189     return TypeSize::getFixed(0);
190   case TargetTransformInfo::RGK_ScalableVector:
191     return TypeSize::getScalable(0);
192   }
193 
194   llvm_unreachable("Unsupported register kind");
195 }
196 
197 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198   return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199       .getFixedValue();
200 }
201 
202 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
203   // If the loop will not be vectorized, don't interleave the loop.
204   // Let regular unroll to unroll the loop, which saves the overflow
205   // check and memory check cost.
206   if (VF == 1)
207     return 1;
208 
209   if (ST->isAtom())
210     return 1;
211 
212   // Sandybridge and Haswell have multiple execution ports and pipelined
213   // vector units.
214   if (ST->hasAVX())
215     return 4;
216 
217   return 2;
218 }
219 
220 InstructionCost X86TTIImpl::getArithmeticInstrCost(
221     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223     ArrayRef<const Value *> Args,
224     const Instruction *CxtI) {
225 
226   // vXi8 multiplications are always promoted to vXi16.
227   if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
228       Ty->getScalarSizeInBits() == 8) {
229     Type *WideVecTy =
230         VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
231     return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
232                             TargetTransformInfo::CastContextHint::None,
233                             CostKind) +
234            getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
235                             TargetTransformInfo::CastContextHint::None,
236                             CostKind) +
237            getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
238   }
239 
240   // Legalize the type.
241   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
242 
243   int ISD = TLI->InstructionOpcodeToISD(Opcode);
244   assert(ISD && "Invalid opcode");
245 
246   if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
247       LT.second.getScalarType() == MVT::i32) {
248     // Check if the operands can be represented as a smaller datatype.
249     bool Op1Signed = false, Op2Signed = false;
250     unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
251     unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
252     unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
253     bool SignedMode = Op1Signed || Op2Signed;
254 
255     // If both are representable as i15 and at least one is constant,
256     // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
257     // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
258     if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
259       bool Op1Constant =
260           isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
261       bool Op2Constant =
262           isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
263       bool Op1Sext = isa<SExtInst>(Args[0]) &&
264                      (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
265       bool Op2Sext = isa<SExtInst>(Args[1]) &&
266                      (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
267 
268       bool IsZeroExtended = !Op1Signed || !Op2Signed;
269       bool IsConstant = Op1Constant || Op2Constant;
270       bool IsSext = Op1Sext || Op2Sext;
271       if (IsConstant || IsZeroExtended || IsSext)
272         LT.second =
273             MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
274     }
275 
276     // Check if the vXi32 operands can be shrunk into a smaller datatype.
277     // This should match the codegen from reduceVMULWidth.
278     // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
279     if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
280       if (OpMinSize <= 7)
281         return LT.first * 3; // pmullw/sext
282       if (!SignedMode && OpMinSize <= 8)
283         return LT.first * 3; // pmullw/zext
284       if (OpMinSize <= 15)
285         return LT.first * 5; // pmullw/pmulhw/pshuf
286       if (!SignedMode && OpMinSize <= 16)
287         return LT.first * 5; // pmullw/pmulhw/pshuf
288     }
289   }
290 
291   // Vector multiply by pow2 will be simplified to shifts.
292   // Vector multiply by -pow2 will be simplified to shifts/negates.
293   if (ISD == ISD::MUL && Op2Info.isConstant() &&
294       (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
295     InstructionCost Cost =
296         getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
297                                Op1Info.getNoProps(), Op2Info.getNoProps());
298     if (Op2Info.isNegatedPowerOf2())
299       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
300     return Cost;
301   }
302 
303   // On X86, vector signed division by constants power-of-two are
304   // normally expanded to the sequence SRA + SRL + ADD + SRA.
305   // The OperandValue properties may not be the same as that of the previous
306   // operation; conservatively assume OP_None.
307   if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
308       Op2Info.isConstant() && Op2Info.isPowerOf2()) {
309     InstructionCost Cost =
310         2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
311                                    Op1Info.getNoProps(), Op2Info.getNoProps());
312     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
313                                    Op1Info.getNoProps(), Op2Info.getNoProps());
314     Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
315                                    Op1Info.getNoProps(), Op2Info.getNoProps());
316 
317     if (ISD == ISD::SREM) {
318       // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
319       Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
320                                      Op2Info.getNoProps());
321       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
322                                      Op2Info.getNoProps());
323     }
324 
325     return Cost;
326   }
327 
328   // Vector unsigned division/remainder will be simplified to shifts/masks.
329   if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
330       Op2Info.isConstant() && Op2Info.isPowerOf2()) {
331     if (ISD == ISD::UDIV)
332       return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
333                                     Op1Info.getNoProps(), Op2Info.getNoProps());
334     // UREM
335     return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
336                                   Op1Info.getNoProps(), Op2Info.getNoProps());
337   }
338 
339   static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
340     { ISD::SHL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
341     { ISD::SRL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
342     { ISD::SRA,  MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
343     { ISD::SHL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
344     { ISD::SRL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
345     { ISD::SRA,  MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
346     { ISD::SHL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
347     { ISD::SRL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
348     { ISD::SRA,  MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
349 
350     { ISD::SHL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
351     { ISD::SRL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
352     { ISD::SRA,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
353     { ISD::SHL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
354     { ISD::SRL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
355     { ISD::SRA,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
356   };
357 
358   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
359     if (const auto *Entry =
360             CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
361       if (auto KindCost = Entry->Cost[CostKind])
362         return LT.first * *KindCost;
363 
364   static const CostKindTblEntry AVX512UniformConstCostTable[] = {
365     { ISD::SHL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psllw + pand.
366     { ISD::SRL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psrlw + pand.
367     { ISD::SRA,  MVT::v64i8,  {  3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
368 
369     { ISD::SHL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psllw + split.
370     { ISD::SRL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psrlw + split.
371     { ISD::SRA,  MVT::v16i16, {  2,  7,  4,  4 } }, // psraw + split.
372 
373     { ISD::SHL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // pslld
374     { ISD::SRL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrld
375     { ISD::SRA,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrad
376     { ISD::SHL,  MVT::v16i32, {  1,  1,  1,  1 } }, // pslld
377     { ISD::SRL,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrld
378     { ISD::SRA,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrad
379 
380     { ISD::SRA,  MVT::v2i64,  {  1,  1,  1,  1 } }, // psraq
381     { ISD::SHL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psllq
382     { ISD::SRL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psrlq
383     { ISD::SRA,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psraq
384     { ISD::SHL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psllq
385     { ISD::SRL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psrlq
386     { ISD::SRA,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psraq
387 
388     { ISD::SDIV, MVT::v16i32, {  6 } }, // pmuludq sequence
389     { ISD::SREM, MVT::v16i32, {  8 } }, // pmuludq+mul+sub sequence
390     { ISD::UDIV, MVT::v16i32, {  5 } }, // pmuludq sequence
391     { ISD::UREM, MVT::v16i32, {  7 } }, // pmuludq+mul+sub sequence
392   };
393 
394   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
395     if (const auto *Entry =
396             CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
397       if (auto KindCost = Entry->Cost[CostKind])
398         return LT.first * *KindCost;
399 
400   static const CostKindTblEntry AVX2UniformConstCostTable[] = {
401     { ISD::SHL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psllw + pand.
402     { ISD::SRL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psrlw + pand.
403     { ISD::SRA,  MVT::v16i8, {  2, 10,  5,  6 } }, // psrlw, pand, pxor, psubb.
404     { ISD::SHL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psllw + pand.
405     { ISD::SRL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psrlw + pand.
406     { ISD::SRA,  MVT::v32i8, {  3, 10,  5,  9 } }, // psrlw, pand, pxor, psubb.
407 
408     { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw
409     { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw
410     { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw
411     { ISD::SHL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psllw
412     { ISD::SRL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psrlw
413     { ISD::SRA,  MVT::v16i16,{  2,  2,  1,  2 } }, // psraw
414 
415     { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
416     { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld
417     { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad
418     { ISD::SHL,  MVT::v8i32, {  2,  2,  1,  2 } }, // pslld
419     { ISD::SRL,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrld
420     { ISD::SRA,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrad
421 
422     { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq
423     { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq
424     { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
425     { ISD::SHL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psllq
426     { ISD::SRL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psrlq
427     { ISD::SRA,  MVT::v4i64, {  4,  4,  3,  6 } }, // psrad + shuffle + split.
428 
429     { ISD::SDIV, MVT::v8i32, {  6 } }, // pmuludq sequence
430     { ISD::SREM, MVT::v8i32, {  8 } }, // pmuludq+mul+sub sequence
431     { ISD::UDIV, MVT::v8i32, {  5 } }, // pmuludq sequence
432     { ISD::UREM, MVT::v8i32, {  7 } }, // pmuludq+mul+sub sequence
433   };
434 
435   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
436     if (const auto *Entry =
437             CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
438       if (auto KindCost = Entry->Cost[CostKind])
439         return LT.first * *KindCost;
440 
441   static const CostKindTblEntry AVXUniformConstCostTable[] = {
442     { ISD::SHL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psllw + pand.
443     { ISD::SRL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psrlw + pand.
444     { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
445     { ISD::SHL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psllw + pand) + split.
446     { ISD::SRL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psrlw + pand) + split.
447     { ISD::SRA,  MVT::v32i8, {  7,  7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
448 
449     { ISD::SHL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psllw.
450     { ISD::SRL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psrlw.
451     { ISD::SRA,  MVT::v8i16, {  1,  2,  1,  1 } }, // psraw.
452     { ISD::SHL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psllw + split.
453     { ISD::SRL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psrlw + split.
454     { ISD::SRA,  MVT::v16i16,{  3,  6,  4,  5 } }, // psraw + split.
455 
456     { ISD::SHL,  MVT::v4i32, {  1,  2,  1,  1 } }, // pslld.
457     { ISD::SRL,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrld.
458     { ISD::SRA,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrad.
459     { ISD::SHL,  MVT::v8i32, {  3,  6,  4,  5 } }, // pslld + split.
460     { ISD::SRL,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrld + split.
461     { ISD::SRA,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrad + split.
462 
463     { ISD::SHL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psllq.
464     { ISD::SRL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psrlq.
465     { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
466     { ISD::SHL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
467     { ISD::SRL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
468     { ISD::SRA,  MVT::v4i64, {  5,  7,  8,  9 } }, // 2 x psrad + shuffle + split.
469 
470     { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
471     { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
472     { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
473     { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
474   };
475 
476   // XOP has faster vXi8 shifts.
477   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
478       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
479     if (const auto *Entry =
480             CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
481       if (auto KindCost = Entry->Cost[CostKind])
482         return LT.first * *KindCost;
483 
484   static const CostKindTblEntry SSE2UniformConstCostTable[] = {
485     { ISD::SHL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psllw + pand.
486     { ISD::SRL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psrlw + pand.
487     { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
488 
489     { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw.
490     { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw.
491     { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw.
492 
493     { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
494     { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld.
495     { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad.
496 
497     { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq.
498     { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq.
499     { ISD::SRA,  MVT::v2i64, {  3,  5,  6,  6 } }, // 2 x psrad + shuffle.
500 
501     { ISD::SDIV, MVT::v4i32, {  6 } }, // pmuludq sequence
502     { ISD::SREM, MVT::v4i32, {  8 } }, // pmuludq+mul+sub sequence
503     { ISD::UDIV, MVT::v4i32, {  5 } }, // pmuludq sequence
504     { ISD::UREM, MVT::v4i32, {  7 } }, // pmuludq+mul+sub sequence
505   };
506 
507   // XOP has faster vXi8 shifts.
508   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
509       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
510     if (const auto *Entry =
511             CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
512       if (auto KindCost = Entry->Cost[CostKind])
513         return LT.first * *KindCost;
514 
515   static const CostKindTblEntry AVX512BWConstCostTable[] = {
516     { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
517     { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
518     { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
519     { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
520 
521     { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
522     { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
523     { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
524     { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
525   };
526 
527   if (Op2Info.isConstant() && ST->hasBWI())
528     if (const auto *Entry =
529             CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
530       if (auto KindCost = Entry->Cost[CostKind])
531         return LT.first * *KindCost;
532 
533   static const CostKindTblEntry AVX512ConstCostTable[] = {
534     { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
535     { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
536     { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
537     { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
538 
539     { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
540     { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
541     { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
542     { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
543 
544     { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
545     { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
546     { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
547     { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
548   };
549 
550   if (Op2Info.isConstant() && ST->hasAVX512())
551     if (const auto *Entry =
552             CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
553       if (auto KindCost = Entry->Cost[CostKind])
554         return LT.first * *KindCost;
555 
556   static const CostKindTblEntry AVX2ConstCostTable[] = {
557     { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
558     { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
559     { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
560     { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
561 
562     { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
563     { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
564     { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
565     { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence
566 
567     { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
568     { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
569     { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
570     { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
571   };
572 
573   if (Op2Info.isConstant() && ST->hasAVX2())
574     if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
575       if (auto KindCost = Entry->Cost[CostKind])
576         return LT.first * *KindCost;
577 
578   static const CostKindTblEntry AVXConstCostTable[] = {
579     { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
580     { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
581     { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
582     { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
583 
584     { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
585     { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
586     { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
587     { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
588 
589     { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
590     { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
591     { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
592     { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
593   };
594 
595   if (Op2Info.isConstant() && ST->hasAVX())
596     if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
597       if (auto KindCost = Entry->Cost[CostKind])
598         return LT.first * *KindCost;
599 
600   static const CostKindTblEntry SSE41ConstCostTable[] = {
601     { ISD::SDIV, MVT::v4i32,  { 15 } }, // vpmuludq sequence
602     { ISD::SREM, MVT::v4i32,  { 20 } }, // vpmuludq+mul+sub sequence
603   };
604 
605   if (Op2Info.isConstant() && ST->hasSSE41())
606     if (const auto *Entry =
607             CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
608       if (auto KindCost = Entry->Cost[CostKind])
609         return LT.first * *KindCost;
610 
611   static const CostKindTblEntry SSE2ConstCostTable[] = {
612     { ISD::SDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
613     { ISD::SREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
614     { ISD::UDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
615     { ISD::UREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
616 
617     { ISD::SDIV, MVT::v8i16,  {  6 } }, // pmulhw sequence
618     { ISD::SREM, MVT::v8i16,  {  8 } }, // pmulhw+mul+sub sequence
619     { ISD::UDIV, MVT::v8i16,  {  6 } }, // pmulhuw sequence
620     { ISD::UREM, MVT::v8i16,  {  8 } }, // pmulhuw+mul+sub sequence
621 
622     { ISD::SDIV, MVT::v4i32,  { 19 } }, // pmuludq sequence
623     { ISD::SREM, MVT::v4i32,  { 24 } }, // pmuludq+mul+sub sequence
624     { ISD::UDIV, MVT::v4i32,  { 15 } }, // pmuludq sequence
625     { ISD::UREM, MVT::v4i32,  { 20 } }, // pmuludq+mul+sub sequence
626   };
627 
628   if (Op2Info.isConstant() && ST->hasSSE2())
629     if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
630       if (auto KindCost = Entry->Cost[CostKind])
631         return LT.first * *KindCost;
632 
633   static const CostKindTblEntry AVX512BWUniformCostTable[] = {
634     { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
635     { ISD::SRL,  MVT::v16i8,  { 3,10, 5, 8 } }, // psrlw + pand.
636     { ISD::SRA,  MVT::v16i8,  { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
637     { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
638     { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
639     { ISD::SRA,  MVT::v32i8,  { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
640     { ISD::SHL,  MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
641     { ISD::SRL,  MVT::v64i8,  { 4, 8, 7,10 } }, // psrlw + pand.
642     { ISD::SRA,  MVT::v64i8,  { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
643 
644     { ISD::SHL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
645     { ISD::SRL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
646     { ISD::SRA,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
647   };
648 
649   if (ST->hasBWI() && Op2Info.isUniform())
650     if (const auto *Entry =
651             CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
652       if (auto KindCost = Entry->Cost[CostKind])
653         return LT.first * *KindCost;
654 
655   static const CostKindTblEntry AVX512UniformCostTable[] = {
656     { ISD::SHL,  MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
657     { ISD::SRL,  MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
658     { ISD::SRA,  MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
659 
660     { ISD::SHL,  MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
661     { ISD::SRL,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
662     { ISD::SRA,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
663 
664     { ISD::SRA,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
665     { ISD::SHL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
666     { ISD::SRL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
667     { ISD::SRA,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
668     { ISD::SHL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
669     { ISD::SRL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
670     { ISD::SRA,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
671   };
672 
673   if (ST->hasAVX512() && Op2Info.isUniform())
674     if (const auto *Entry =
675             CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
676       if (auto KindCost = Entry->Cost[CostKind])
677         return LT.first * *KindCost;
678 
679   static const CostKindTblEntry AVX2UniformCostTable[] = {
680     // Uniform splats are cheaper for the following instructions.
681     { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
682     { ISD::SRL,  MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
683     { ISD::SRA,  MVT::v16i8,  { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
684     { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
685     { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
686     { ISD::SRA,  MVT::v32i8,  { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
687 
688     { ISD::SHL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
689     { ISD::SRL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
690     { ISD::SRA,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
691     { ISD::SHL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
692     { ISD::SRL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
693     { ISD::SRA,  MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
694 
695     { ISD::SHL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
696     { ISD::SRL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
697     { ISD::SRA,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
698     { ISD::SHL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
699     { ISD::SRL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
700     { ISD::SRA,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad
701 
702     { ISD::SHL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
703     { ISD::SRL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
704     { ISD::SRA,  MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
705     { ISD::SHL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
706     { ISD::SRL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
707     { ISD::SRA,  MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
708   };
709 
710   if (ST->hasAVX2() && Op2Info.isUniform())
711     if (const auto *Entry =
712             CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
713       if (auto KindCost = Entry->Cost[CostKind])
714         return LT.first * *KindCost;
715 
716   static const CostKindTblEntry AVXUniformCostTable[] = {
717     { ISD::SHL,  MVT::v16i8,  {  4, 4, 6, 8 } }, // psllw + pand.
718     { ISD::SRL,  MVT::v16i8,  {  4, 8, 5, 8 } }, // psrlw + pand.
719     { ISD::SRA,  MVT::v16i8,  {  6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
720     { ISD::SHL,  MVT::v32i8,  {  7, 8,11,14 } }, // psllw + pand + split.
721     { ISD::SRL,  MVT::v32i8,  {  7, 9,10,14 } }, // psrlw + pand + split.
722     { ISD::SRA,  MVT::v32i8,  { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
723 
724     { ISD::SHL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psllw.
725     { ISD::SRL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psrlw.
726     { ISD::SRA,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psraw.
727     { ISD::SHL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psllw + split.
728     { ISD::SRL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psrlw + split.
729     { ISD::SRA,  MVT::v16i16, {  3, 7, 5, 7 } }, // psraw + split.
730 
731     { ISD::SHL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // pslld.
732     { ISD::SRL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrld.
733     { ISD::SRA,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrad.
734     { ISD::SHL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // pslld + split.
735     { ISD::SRL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrld + split.
736     { ISD::SRA,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrad + split.
737 
738     { ISD::SHL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psllq.
739     { ISD::SRL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psrlq.
740     { ISD::SRA,  MVT::v2i64,  {  3, 4, 5, 7 } }, // 2 x psrad + shuffle.
741     { ISD::SHL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psllq + split.
742     { ISD::SRL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psrlq + split.
743     { ISD::SRA,  MVT::v4i64,  {  6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
744   };
745 
746   // XOP has faster vXi8 shifts.
747   if (ST->hasAVX() && Op2Info.isUniform() &&
748       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
749     if (const auto *Entry =
750             CostTableLookup(AVXUniformCostTable, ISD, LT.second))
751       if (auto KindCost = Entry->Cost[CostKind])
752         return LT.first * *KindCost;
753 
754   static const CostKindTblEntry SSE2UniformCostTable[] = {
755     // Uniform splats are cheaper for the following instructions.
756     { ISD::SHL,  MVT::v16i8, {  9, 10, 6, 9 } }, // psllw + pand.
757     { ISD::SRL,  MVT::v16i8, {  9, 13, 5, 9 } }, // psrlw + pand.
758     { ISD::SRA,  MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
759 
760     { ISD::SHL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psllw.
761     { ISD::SRL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psrlw.
762     { ISD::SRA,  MVT::v8i16, {  2, 2, 1, 2 } }, // psraw.
763 
764     { ISD::SHL,  MVT::v4i32, {  2, 2, 1, 2 } }, // pslld
765     { ISD::SRL,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrld.
766     { ISD::SRA,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrad.
767 
768     { ISD::SHL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psllq.
769     { ISD::SRL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psrlq.
770     { ISD::SRA,  MVT::v2i64, {  5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
771   };
772 
773   if (ST->hasSSE2() && Op2Info.isUniform() &&
774       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
775     if (const auto *Entry =
776             CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
777       if (auto KindCost = Entry->Cost[CostKind])
778         return LT.first * *KindCost;
779 
780   static const CostKindTblEntry AVX512DQCostTable[] = {
781     { ISD::MUL,  MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
782     { ISD::MUL,  MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
783     { ISD::MUL,  MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
784   };
785 
786   // Look for AVX512DQ lowering tricks for custom cases.
787   if (ST->hasDQI())
788     if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
789       if (auto KindCost = Entry->Cost[CostKind])
790         return LT.first * *KindCost;
791 
792   static const CostKindTblEntry AVX512BWCostTable[] = {
793     { ISD::SHL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsllvw/pack sequence.
794     { ISD::SRL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
795     { ISD::SRA,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsravw/pack sequence.
796     { ISD::SHL,   MVT::v32i8,   {  4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
797     { ISD::SRL,   MVT::v32i8,   {  4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
798     { ISD::SRA,   MVT::v32i8,   {  6, 13,24,30 } }, // extend/vpsravw/pack sequence.
799     { ISD::SHL,   MVT::v64i8,   {  6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
800     { ISD::SRL,   MVT::v64i8,   {  7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
801     { ISD::SRA,   MVT::v64i8,   { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
802 
803     { ISD::SHL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsllvw
804     { ISD::SRL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsrlvw
805     { ISD::SRA,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsravw
806     { ISD::SHL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsllvw
807     { ISD::SRL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsrlvw
808     { ISD::SRA,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsravw
809     { ISD::SHL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsllvw
810     { ISD::SRL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsrlvw
811     { ISD::SRA,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsravw
812 
813     { ISD::ADD,   MVT::v64i8,   {  1,  1, 1, 1 } }, // paddb
814     { ISD::ADD,   MVT::v32i16,  {  1,  1, 1, 1 } }, // paddw
815 
816     { ISD::ADD,   MVT::v32i8,   {  1,  1, 1, 1 } }, // paddb
817     { ISD::ADD,   MVT::v16i16,  {  1,  1, 1, 1 } }, // paddw
818     { ISD::ADD,   MVT::v8i32,   {  1,  1, 1, 1 } }, // paddd
819     { ISD::ADD,   MVT::v4i64,   {  1,  1, 1, 1 } }, // paddq
820 
821     { ISD::SUB,   MVT::v64i8,   {  1,  1, 1, 1 } }, // psubb
822     { ISD::SUB,   MVT::v32i16,  {  1,  1, 1, 1 } }, // psubw
823 
824     { ISD::MUL,   MVT::v32i16,  {  1,  5, 1, 1 } }, // pmullw
825 
826     { ISD::SUB,   MVT::v32i8,   {  1,  1, 1, 1 } }, // psubb
827     { ISD::SUB,   MVT::v16i16,  {  1,  1, 1, 1 } }, // psubw
828     { ISD::SUB,   MVT::v8i32,   {  1,  1, 1, 1 } }, // psubd
829     { ISD::SUB,   MVT::v4i64,   {  1,  1, 1, 1 } }, // psubq
830   };
831 
832   // Look for AVX512BW lowering tricks for custom cases.
833   if (ST->hasBWI())
834     if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
835       if (auto KindCost = Entry->Cost[CostKind])
836         return LT.first * *KindCost;
837 
838   static const CostKindTblEntry AVX512CostTable[] = {
839     { ISD::SHL,     MVT::v64i8,   { 15, 19,27,33 } }, // vpblendv+split sequence.
840     { ISD::SRL,     MVT::v64i8,   { 15, 19,30,36 } }, // vpblendv+split sequence.
841     { ISD::SRA,     MVT::v64i8,   { 37, 37,51,63 } }, // vpblendv+split sequence.
842 
843     { ISD::SHL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
844     { ISD::SRL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
845     { ISD::SRA,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
846 
847     { ISD::SHL,     MVT::v4i32,   {  1,  1, 1, 1 } },
848     { ISD::SRL,     MVT::v4i32,   {  1,  1, 1, 1 } },
849     { ISD::SRA,     MVT::v4i32,   {  1,  1, 1, 1 } },
850     { ISD::SHL,     MVT::v8i32,   {  1,  1, 1, 1 } },
851     { ISD::SRL,     MVT::v8i32,   {  1,  1, 1, 1 } },
852     { ISD::SRA,     MVT::v8i32,   {  1,  1, 1, 1 } },
853     { ISD::SHL,     MVT::v16i32,  {  1,  1, 1, 1 } },
854     { ISD::SRL,     MVT::v16i32,  {  1,  1, 1, 1 } },
855     { ISD::SRA,     MVT::v16i32,  {  1,  1, 1, 1 } },
856 
857     { ISD::SHL,     MVT::v2i64,   {  1,  1, 1, 1 } },
858     { ISD::SRL,     MVT::v2i64,   {  1,  1, 1, 1 } },
859     { ISD::SRA,     MVT::v2i64,   {  1,  1, 1, 1 } },
860     { ISD::SHL,     MVT::v4i64,   {  1,  1, 1, 1 } },
861     { ISD::SRL,     MVT::v4i64,   {  1,  1, 1, 1 } },
862     { ISD::SRA,     MVT::v4i64,   {  1,  1, 1, 1 } },
863     { ISD::SHL,     MVT::v8i64,   {  1,  1, 1, 1 } },
864     { ISD::SRL,     MVT::v8i64,   {  1,  1, 1, 1 } },
865     { ISD::SRA,     MVT::v8i64,   {  1,  1, 1, 1 } },
866 
867     { ISD::ADD,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*paddb + split
868     { ISD::ADD,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*paddw + split
869 
870     { ISD::SUB,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*psubb + split
871     { ISD::SUB,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*psubw + split
872 
873     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 1 } },
874     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 1 } },
875     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 1 } },
876     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 1 } },
877 
878     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 1 } },
879     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 1 } },
880     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 1 } },
881     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 1 } },
882 
883     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 1 } },
884     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 1 } },
885     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 1 } },
886     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 1 } },
887 
888     { ISD::MUL,     MVT::v16i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
889     { ISD::MUL,     MVT::v8i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
890     { ISD::MUL,     MVT::v4i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
891     { ISD::MUL,     MVT::v8i64,   {  6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
892     { ISD::MUL,     MVT::i64,     {  1 } }, // Skylake from http://www.agner.org/
893 
894     { ISD::FNEG,    MVT::v8f64,   {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
895     { ISD::FADD,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
896     { ISD::FADD,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
897     { ISD::FSUB,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
898     { ISD::FSUB,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
899     { ISD::FMUL,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
900     { ISD::FMUL,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
901     { ISD::FMUL,    MVT::v2f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
902     { ISD::FMUL,    MVT::f64,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
903 
904     { ISD::FDIV,    MVT::f64,     {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
905     { ISD::FDIV,    MVT::v2f64,   {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
906     { ISD::FDIV,    MVT::v4f64,   {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
907     { ISD::FDIV,    MVT::v8f64,   { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
908 
909     { ISD::FNEG,    MVT::v16f32,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
910     { ISD::FADD,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
911     { ISD::FADD,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
912     { ISD::FSUB,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
913     { ISD::FSUB,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
914     { ISD::FMUL,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
915     { ISD::FMUL,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
916     { ISD::FMUL,    MVT::v4f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
917     { ISD::FMUL,    MVT::f32,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
918 
919     { ISD::FDIV,    MVT::f32,     {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
920     { ISD::FDIV,    MVT::v4f32,   {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
921     { ISD::FDIV,    MVT::v8f32,   {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
922     { ISD::FDIV,    MVT::v16f32,  { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
923   };
924 
925   if (ST->hasAVX512())
926     if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
927       if (auto KindCost = Entry->Cost[CostKind])
928         return LT.first * *KindCost;
929 
930   static const CostKindTblEntry AVX2ShiftCostTable[] = {
931     // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them
932     // custom in order to detect the cases where the shift amount is a scalar one.
933     { ISD::SHL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
934     { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
935     { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
936     { ISD::SHL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
937     { ISD::SRL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
938     { ISD::SRA,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
939     { ISD::SHL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
940     { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
941     { ISD::SHL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
942     { ISD::SRL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
943   };
944 
945   if (ST->hasAVX512()) {
946     if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
947       // On AVX512, a packed v32i16 shift left by a constant build_vector
948       // is lowered into a vector multiply (vpmullw).
949       return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
950                                     Op1Info.getNoProps(), Op2Info.getNoProps());
951   }
952 
953   // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
954   if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
955     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
956         Op2Info.isConstant())
957       // On AVX2, a packed v16i16 shift left by a constant build_vector
958       // is lowered into a vector multiply (vpmullw).
959       return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
960                                     Op1Info.getNoProps(), Op2Info.getNoProps());
961 
962     if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
963       if (auto KindCost = Entry->Cost[CostKind])
964         return LT.first * *KindCost;
965   }
966 
967   static const CostKindTblEntry XOPShiftCostTable[] = {
968     // 128bit shifts take 1cy, but right shifts require negation beforehand.
969     { ISD::SHL,     MVT::v16i8,  { 1, 3, 1, 1 } },
970     { ISD::SRL,     MVT::v16i8,  { 2, 3, 1, 1 } },
971     { ISD::SRA,     MVT::v16i8,  { 2, 3, 1, 1 } },
972     { ISD::SHL,     MVT::v8i16,  { 1, 3, 1, 1 } },
973     { ISD::SRL,     MVT::v8i16,  { 2, 3, 1, 1 } },
974     { ISD::SRA,     MVT::v8i16,  { 2, 3, 1, 1 } },
975     { ISD::SHL,     MVT::v4i32,  { 1, 3, 1, 1 } },
976     { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 1 } },
977     { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 1 } },
978     { ISD::SHL,     MVT::v2i64,  { 1, 3, 1, 1 } },
979     { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } },
980     { ISD::SRA,     MVT::v2i64,  { 2, 3, 1, 1 } },
981     // 256bit shifts require splitting if AVX2 didn't catch them above.
982     { ISD::SHL,     MVT::v32i8,  { 4, 7, 5, 6 } },
983     { ISD::SRL,     MVT::v32i8,  { 6, 7, 5, 6 } },
984     { ISD::SRA,     MVT::v32i8,  { 6, 7, 5, 6 } },
985     { ISD::SHL,     MVT::v16i16, { 4, 7, 5, 6 } },
986     { ISD::SRL,     MVT::v16i16, { 6, 7, 5, 6 } },
987     { ISD::SRA,     MVT::v16i16, { 6, 7, 5, 6 } },
988     { ISD::SHL,     MVT::v8i32,  { 4, 7, 5, 6 } },
989     { ISD::SRL,     MVT::v8i32,  { 6, 7, 5, 6 } },
990     { ISD::SRA,     MVT::v8i32,  { 6, 7, 5, 6 } },
991     { ISD::SHL,     MVT::v4i64,  { 4, 7, 5, 6 } },
992     { ISD::SRL,     MVT::v4i64,  { 6, 7, 5, 6 } },
993     { ISD::SRA,     MVT::v4i64,  { 6, 7, 5, 6 } },
994   };
995 
996   // Look for XOP lowering tricks.
997   if (ST->hasXOP()) {
998     // If the right shift is constant then we'll fold the negation so
999     // it's as cheap as a left shift.
1000     int ShiftISD = ISD;
1001     if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1002       ShiftISD = ISD::SHL;
1003     if (const auto *Entry =
1004             CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1005       if (auto KindCost = Entry->Cost[CostKind])
1006         return LT.first * *KindCost;
1007   }
1008 
1009   if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1010     MVT VT = LT.second;
1011     // Vector shift left by non uniform constant can be lowered
1012     // into vector multiply.
1013     if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1014         ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1015       ISD = ISD::MUL;
1016   }
1017 
1018   static const CostKindTblEntry GLMCostTable[] = {
1019     { ISD::FDIV,  MVT::f32,   { 18, 19, 1, 1 } }, // divss
1020     { ISD::FDIV,  MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1021     { ISD::FDIV,  MVT::f64,   { 33, 34, 1, 1 } }, // divsd
1022     { ISD::FDIV,  MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1023   };
1024 
1025   if (ST->useGLMDivSqrtCosts())
1026     if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1027       if (auto KindCost = Entry->Cost[CostKind])
1028         return LT.first * *KindCost;
1029 
1030   static const CostKindTblEntry SLMCostTable[] = {
1031     { ISD::MUL,   MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1032     { ISD::MUL,   MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
1033     { ISD::FMUL,  MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
1034     { ISD::FMUL,  MVT::f32,   {  1,  4, 1, 1 } }, // mulss
1035     { ISD::FMUL,  MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
1036     { ISD::FMUL,  MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
1037     { ISD::FDIV,  MVT::f32,   { 17, 19, 1, 1 } }, // divss
1038     { ISD::FDIV,  MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1039     { ISD::FDIV,  MVT::f64,   { 32, 34, 1, 1 } }, // divsd
1040     { ISD::FDIV,  MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1041     { ISD::FADD,  MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
1042     { ISD::FSUB,  MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
1043     // v2i64/v4i64 mul is custom lowered as a series of long
1044     // multiplies(3), shifts(3) and adds(2).
1045     // slm muldq throughput is 2 and addq throughput is 4,
1046     // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1047     //       2X4 (addq throughput) = 17
1048     { ISD::MUL,   MVT::v2i64, { 17, 22, 9, 9 } },
1049     // slm addq/subq throughput is 4
1050     { ISD::ADD,   MVT::v2i64, {  4,  2, 1, 2 } },
1051     { ISD::SUB,   MVT::v2i64, {  4,  2, 1, 2 } },
1052   };
1053 
1054   if (ST->useSLMArithCosts())
1055     if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1056       if (auto KindCost = Entry->Cost[CostKind])
1057         return LT.first * *KindCost;
1058 
1059   static const CostKindTblEntry AVX2CostTable[] = {
1060     { ISD::SHL,  MVT::v16i8,   {  6, 21,11,16 } }, // vpblendvb sequence.
1061     { ISD::SHL,  MVT::v32i8,   {  6, 23,11,22 } }, // vpblendvb sequence.
1062     { ISD::SHL,  MVT::v8i16,   {  5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
1063     { ISD::SHL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1064 
1065     { ISD::SRL,  MVT::v16i8,   {  6, 27,12,18 } }, // vpblendvb sequence.
1066     { ISD::SRL,  MVT::v32i8,   {  8, 30,12,24 } }, // vpblendvb sequence.
1067     { ISD::SRL,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1068     { ISD::SRL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1069 
1070     { ISD::SRA,  MVT::v16i8,   { 17, 17,24,30 } }, // vpblendvb sequence.
1071     { ISD::SRA,  MVT::v32i8,   { 18, 20,24,43 } }, // vpblendvb sequence.
1072     { ISD::SRA,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1073     { ISD::SRA,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1074     { ISD::SRA,  MVT::v2i64,   {  4,  5, 5, 5 } }, // srl/xor/sub sequence.
1075     { ISD::SRA,  MVT::v4i64,   {  8,  8, 5, 9 } }, // srl/xor/sub sequence.
1076 
1077     { ISD::SUB,  MVT::v32i8,   {  1,  1, 1, 2 } }, // psubb
1078     { ISD::ADD,  MVT::v32i8,   {  1,  1, 1, 2 } }, // paddb
1079     { ISD::SUB,  MVT::v16i16,  {  1,  1, 1, 2 } }, // psubw
1080     { ISD::ADD,  MVT::v16i16,  {  1,  1, 1, 2 } }, // paddw
1081     { ISD::SUB,  MVT::v8i32,   {  1,  1, 1, 2 } }, // psubd
1082     { ISD::ADD,  MVT::v8i32,   {  1,  1, 1, 2 } }, // paddd
1083     { ISD::SUB,  MVT::v4i64,   {  1,  1, 1, 2 } }, // psubq
1084     { ISD::ADD,  MVT::v4i64,   {  1,  1, 1, 2 } }, // paddq
1085 
1086     { ISD::MUL,  MVT::v16i16,  {  2,  5, 1, 1 } }, // pmullw
1087     { ISD::MUL,  MVT::v8i32,   {  4, 10, 1, 2 } }, // pmulld
1088     { ISD::MUL,  MVT::v4i32,   {  2, 10, 1, 2 } }, // pmulld
1089     { ISD::MUL,  MVT::v4i64,   {  6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1090     { ISD::MUL,  MVT::v2i64,   {  6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1091 
1092     { ISD::FNEG, MVT::v4f64,   {  1,  1, 1, 2 } }, // vxorpd
1093     { ISD::FNEG, MVT::v8f32,   {  1,  1, 1, 2 } }, // vxorps
1094 
1095     { ISD::FADD, MVT::f64,     {  1,  4, 1, 1 } }, // vaddsd
1096     { ISD::FADD, MVT::f32,     {  1,  4, 1, 1 } }, // vaddss
1097     { ISD::FADD, MVT::v2f64,   {  1,  4, 1, 1 } }, // vaddpd
1098     { ISD::FADD, MVT::v4f32,   {  1,  4, 1, 1 } }, // vaddps
1099     { ISD::FADD, MVT::v4f64,   {  1,  4, 1, 2 } }, // vaddpd
1100     { ISD::FADD, MVT::v8f32,   {  1,  4, 1, 2 } }, // vaddps
1101 
1102     { ISD::FSUB, MVT::f64,     {  1,  4, 1, 1 } }, // vsubsd
1103     { ISD::FSUB, MVT::f32,     {  1,  4, 1, 1 } }, // vsubss
1104     { ISD::FSUB, MVT::v2f64,   {  1,  4, 1, 1 } }, // vsubpd
1105     { ISD::FSUB, MVT::v4f32,   {  1,  4, 1, 1 } }, // vsubps
1106     { ISD::FSUB, MVT::v4f64,   {  1,  4, 1, 2 } }, // vsubpd
1107     { ISD::FSUB, MVT::v8f32,   {  1,  4, 1, 2 } }, // vsubps
1108 
1109     { ISD::FMUL, MVT::f64,     {  1,  5, 1, 1 } }, // vmulsd
1110     { ISD::FMUL, MVT::f32,     {  1,  5, 1, 1 } }, // vmulss
1111     { ISD::FMUL, MVT::v2f64,   {  1,  5, 1, 1 } }, // vmulpd
1112     { ISD::FMUL, MVT::v4f32,   {  1,  5, 1, 1 } }, // vmulps
1113     { ISD::FMUL, MVT::v4f64,   {  1,  5, 1, 2 } }, // vmulpd
1114     { ISD::FMUL, MVT::v8f32,   {  1,  5, 1, 2 } }, // vmulps
1115 
1116     { ISD::FDIV, MVT::f32,     {  7, 13, 1, 1 } }, // vdivss
1117     { ISD::FDIV, MVT::v4f32,   {  7, 13, 1, 1 } }, // vdivps
1118     { ISD::FDIV, MVT::v8f32,   { 14, 21, 1, 3 } }, // vdivps
1119     { ISD::FDIV, MVT::f64,     { 14, 20, 1, 1 } }, // vdivsd
1120     { ISD::FDIV, MVT::v2f64,   { 14, 20, 1, 1 } }, // vdivpd
1121     { ISD::FDIV, MVT::v4f64,   { 28, 35, 1, 3 } }, // vdivpd
1122   };
1123 
1124   // Look for AVX2 lowering tricks for custom cases.
1125   if (ST->hasAVX2())
1126     if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1127       if (auto KindCost = Entry->Cost[CostKind])
1128         return LT.first * *KindCost;
1129 
1130   static const CostKindTblEntry AVX1CostTable[] = {
1131     // We don't have to scalarize unsupported ops. We can issue two half-sized
1132     // operations and we only need to extract the upper YMM half.
1133     // Two ops + 1 extract + 1 insert = 4.
1134     { ISD::MUL,     MVT::v16i16,  {  4,  8,  5,  6 } }, // pmullw + split
1135     { ISD::MUL,     MVT::v8i32,   {  5,  8,  5, 10 } }, // pmulld + split
1136     { ISD::MUL,     MVT::v4i32,   {  2,  5,  1,  3 } }, // pmulld
1137     { ISD::MUL,     MVT::v4i64,   { 12, 15, 19, 20 } },
1138 
1139     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vandps
1140     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vandps
1141     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vandps
1142     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vandps
1143 
1144     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 2 } }, // vorps
1145     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 2 } }, // vorps
1146     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 2 } }, // vorps
1147     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 2 } }, // vorps
1148 
1149     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vxorps
1150     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vxorps
1151     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vxorps
1152     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vxorps
1153 
1154     { ISD::SUB,     MVT::v32i8,   {  4,  2, 5, 6 } }, // psubb + split
1155     { ISD::ADD,     MVT::v32i8,   {  4,  2, 5, 6 } }, // paddb + split
1156     { ISD::SUB,     MVT::v16i16,  {  4,  2, 5, 6 } }, // psubw + split
1157     { ISD::ADD,     MVT::v16i16,  {  4,  2, 5, 6 } }, // paddw + split
1158     { ISD::SUB,     MVT::v8i32,   {  4,  2, 5, 6 } }, // psubd + split
1159     { ISD::ADD,     MVT::v8i32,   {  4,  2, 5, 6 } }, // paddd + split
1160     { ISD::SUB,     MVT::v4i64,   {  4,  2, 5, 6 } }, // psubq + split
1161     { ISD::ADD,     MVT::v4i64,   {  4,  2, 5, 6 } }, // paddq + split
1162     { ISD::SUB,     MVT::v2i64,   {  1,  1, 1, 1 } }, // psubq
1163     { ISD::ADD,     MVT::v2i64,   {  1,  1, 1, 1 } }, // paddq
1164 
1165     { ISD::SHL,     MVT::v16i8,   { 10, 21,11,17 } }, // pblendvb sequence.
1166     { ISD::SHL,     MVT::v32i8,   { 22, 22,27,40 } }, // pblendvb sequence + split.
1167     { ISD::SHL,     MVT::v8i16,   {  6,  9,11,11 } }, // pblendvb sequence.
1168     { ISD::SHL,     MVT::v16i16,  { 13, 16,24,25 } }, // pblendvb sequence + split.
1169     { ISD::SHL,     MVT::v4i32,   {  3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1170     { ISD::SHL,     MVT::v8i32,   {  9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1171     { ISD::SHL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1172     { ISD::SHL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1173 
1174     { ISD::SRL,     MVT::v16i8,   { 11, 27,12,18 } }, // pblendvb sequence.
1175     { ISD::SRL,     MVT::v32i8,   { 23, 23,30,43 } }, // pblendvb sequence + split.
1176     { ISD::SRL,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1177     { ISD::SRL,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1178     { ISD::SRL,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1179     { ISD::SRL,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1180     { ISD::SRL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1181     { ISD::SRL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1182 
1183     { ISD::SRA,     MVT::v16i8,   { 21, 22,24,36 } }, // pblendvb sequence.
1184     { ISD::SRA,     MVT::v32i8,   { 44, 45,51,76 } }, // pblendvb sequence + split.
1185     { ISD::SRA,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1186     { ISD::SRA,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1187     { ISD::SRA,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1188     { ISD::SRA,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1189     { ISD::SRA,     MVT::v2i64,   {  5,  6,10,14 } }, // Shift each lane + blend.
1190     { ISD::SRA,     MVT::v4i64,   { 12, 12,22,30 } }, // Shift each lane + blend + split.
1191 
1192     { ISD::FNEG,    MVT::v4f64,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1193     { ISD::FNEG,    MVT::v8f32,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1194 
1195     { ISD::FADD,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1196     { ISD::FADD,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1197     { ISD::FADD,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1198     { ISD::FADD,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1199     { ISD::FADD,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1200     { ISD::FADD,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1201 
1202     { ISD::FSUB,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1203     { ISD::FSUB,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1204     { ISD::FSUB,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1205     { ISD::FSUB,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1206     { ISD::FSUB,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1207     { ISD::FSUB,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1208 
1209     { ISD::FMUL,    MVT::f64,     {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1210     { ISD::FMUL,    MVT::f32,     {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1211     { ISD::FMUL,    MVT::v2f64,   {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1212     { ISD::FMUL,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1213     { ISD::FMUL,    MVT::v4f64,   {  4,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1214     { ISD::FMUL,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1215 
1216     { ISD::FDIV,    MVT::f32,     { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1217     { ISD::FDIV,    MVT::v4f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1218     { ISD::FDIV,    MVT::v8f32,   { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1219     { ISD::FDIV,    MVT::f64,     { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1220     { ISD::FDIV,    MVT::v2f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1221     { ISD::FDIV,    MVT::v4f64,   { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1222   };
1223 
1224   if (ST->hasAVX())
1225     if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1226       if (auto KindCost = Entry->Cost[CostKind])
1227         return LT.first * *KindCost;
1228 
1229   static const CostKindTblEntry SSE42CostTable[] = {
1230     { ISD::FADD, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1231     { ISD::FADD, MVT::f32,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1232     { ISD::FADD, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1233     { ISD::FADD, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1234 
1235     { ISD::FSUB, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1236     { ISD::FSUB, MVT::f32 ,   {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1237     { ISD::FSUB, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1238     { ISD::FSUB, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1239 
1240     { ISD::FMUL, MVT::f64,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1241     { ISD::FMUL, MVT::f32,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1242     { ISD::FMUL, MVT::v2f64,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1243     { ISD::FMUL, MVT::v4f32,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1244 
1245     { ISD::FDIV,  MVT::f32,   { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1246     { ISD::FDIV,  MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1247     { ISD::FDIV,  MVT::f64,   { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1248     { ISD::FDIV,  MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 
1250     { ISD::MUL,   MVT::v2i64, {  6, 10,10,10 } }  // 3*pmuludq/3*shift/2*add
1251   };
1252 
1253   if (ST->hasSSE42())
1254     if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1255       if (auto KindCost = Entry->Cost[CostKind])
1256         return LT.first * *KindCost;
1257 
1258   static const CostKindTblEntry SSE41CostTable[] = {
1259     { ISD::SHL,  MVT::v16i8,  { 15, 24,17,22 } }, // pblendvb sequence.
1260     { ISD::SHL,  MVT::v8i16,  { 11, 14,11,11 } }, // pblendvb sequence.
1261     { ISD::SHL,  MVT::v4i32,  { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1262 
1263     { ISD::SRL,  MVT::v16i8,  { 16, 27,18,24 } }, // pblendvb sequence.
1264     { ISD::SRL,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1265     { ISD::SRL,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1266     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1267 
1268     { ISD::SRA,  MVT::v16i8,  { 38, 41,30,36 } }, // pblendvb sequence.
1269     { ISD::SRA,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1270     { ISD::SRA,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1271     { ISD::SRA,  MVT::v2i64,  {  8, 17, 5, 7 } }, // splat+shuffle sequence.
1272 
1273     { ISD::MUL,  MVT::v4i32,  {  2, 11, 1, 1 } }  // pmulld (Nehalem from agner.org)
1274   };
1275 
1276   if (ST->hasSSE41())
1277     if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1278       if (auto KindCost = Entry->Cost[CostKind])
1279         return LT.first * *KindCost;
1280 
1281   static const CostKindTblEntry SSE2CostTable[] = {
1282     // We don't correctly identify costs of casts because they are marked as
1283     // custom.
1284     { ISD::SHL,  MVT::v16i8,  { 13, 21,26,28 } }, // cmpgtb sequence.
1285     { ISD::SHL,  MVT::v8i16,  { 24, 27,16,20 } }, // cmpgtw sequence.
1286     { ISD::SHL,  MVT::v4i32,  { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1287     { ISD::SHL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1288 
1289     { ISD::SRL,  MVT::v16i8,  { 14, 28,27,30 } }, // cmpgtb sequence.
1290     { ISD::SRL,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1291     { ISD::SRL,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1292     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1293 
1294     { ISD::SRA,  MVT::v16i8,  { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1295     { ISD::SRA,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1296     { ISD::SRA,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1297     { ISD::SRA,  MVT::v2i64,  {  8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1298 
1299     { ISD::AND,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pand
1300     { ISD::AND,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pand
1301     { ISD::AND,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pand
1302     { ISD::AND,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pand
1303 
1304     { ISD::OR,   MVT::v16i8,  {  1,  1, 1, 1 } }, // por
1305     { ISD::OR,   MVT::v8i16,  {  1,  1, 1, 1 } }, // por
1306     { ISD::OR,   MVT::v4i32,  {  1,  1, 1, 1 } }, // por
1307     { ISD::OR,   MVT::v2i64,  {  1,  1, 1, 1 } }, // por
1308 
1309     { ISD::XOR,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pxor
1310     { ISD::XOR,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pxor
1311     { ISD::XOR,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pxor
1312     { ISD::XOR,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pxor
1313 
1314     { ISD::ADD,  MVT::v2i64,  {  1,  2, 1, 2 } }, // paddq
1315     { ISD::SUB,  MVT::v2i64,  {  1,  2, 1, 2 } }, // psubq
1316 
1317     { ISD::MUL,  MVT::v8i16,  {  1,  5, 1, 1 } }, // pmullw
1318     { ISD::MUL,  MVT::v4i32,  {  6,  8, 7, 7 } }, // 3*pmuludq/4*shuffle
1319     { ISD::MUL,  MVT::v2i64,  {  8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1320 
1321     { ISD::FDIV, MVT::f32,    { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1322     { ISD::FDIV, MVT::v4f32,  { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1323     { ISD::FDIV, MVT::f64,    { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1324     { ISD::FDIV, MVT::v2f64,  { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1325 
1326     { ISD::FNEG, MVT::f32,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1327     { ISD::FNEG, MVT::f64,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1328     { ISD::FNEG, MVT::v4f32,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1329     { ISD::FNEG, MVT::v2f64,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1330 
1331     { ISD::FADD, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1332     { ISD::FADD, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1333     { ISD::FADD, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1334 
1335     { ISD::FSUB, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1336     { ISD::FSUB, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1337     { ISD::FSUB, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1338 
1339     { ISD::FMUL, MVT::f64,    {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1340     { ISD::FMUL, MVT::v2f64,  {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1341   };
1342 
1343   if (ST->hasSSE2())
1344     if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1345       if (auto KindCost = Entry->Cost[CostKind])
1346         return LT.first * *KindCost;
1347 
1348   static const CostKindTblEntry SSE1CostTable[] = {
1349     { ISD::FDIV, MVT::f32,   { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1350     { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1351 
1352     { ISD::FNEG, MVT::f32,   {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1353     { ISD::FNEG, MVT::v4f32, {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1354 
1355     { ISD::FADD, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1356     { ISD::FADD, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1357 
1358     { ISD::FSUB, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1359     { ISD::FSUB, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1360 
1361     { ISD::FMUL, MVT::f32,   {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1362     { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1363   };
1364 
1365   if (ST->hasSSE1())
1366     if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1367       if (auto KindCost = Entry->Cost[CostKind])
1368         return LT.first * *KindCost;
1369 
1370   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1371     { ISD::ADD,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1372     { ISD::SUB,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1373     { ISD::MUL,  MVT::i64,  {  2 } }, // Nehalem from http://www.agner.org/
1374   };
1375 
1376   if (ST->is64Bit())
1377     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1378       if (auto KindCost = Entry->Cost[CostKind])
1379         return LT.first * *KindCost;
1380 
1381   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1382     { ISD::ADD,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1383     { ISD::ADD,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1384     { ISD::ADD,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1385 
1386     { ISD::SUB,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1387     { ISD::SUB,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1388     { ISD::SUB,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1389 
1390     { ISD::FNEG, MVT::f64, {  2,  2, 1, 3 } }, // (x87)
1391     { ISD::FADD, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1392     { ISD::FSUB, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1393     { ISD::FMUL, MVT::f64, {  2,  5, 1, 1 } }, // (x87)
1394     { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1395   };
1396 
1397   if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1398     if (auto KindCost = Entry->Cost[CostKind])
1399       return LT.first * *KindCost;
1400 
1401   // It is not a good idea to vectorize division. We have to scalarize it and
1402   // in the process we will often end up having to spill regular
1403   // registers. The overhead of division is going to dominate most kernels
1404   // anyways so try hard to prevent vectorization of division - it is
1405   // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1406   // to hide "20 cycles" for each lane.
1407   if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1408       (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1409        ISD == ISD::UREM)) {
1410     InstructionCost ScalarCost =
1411         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1412                                Op1Info.getNoProps(), Op2Info.getNoProps());
1413     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1414   }
1415 
1416   // Handle some basic single instruction code size cases.
1417   if (CostKind == TTI::TCK_CodeSize) {
1418     switch (ISD) {
1419     case ISD::FADD:
1420     case ISD::FSUB:
1421     case ISD::FMUL:
1422     case ISD::FDIV:
1423     case ISD::FNEG:
1424     case ISD::AND:
1425     case ISD::OR:
1426     case ISD::XOR:
1427       return LT.first;
1428       break;
1429     }
1430   }
1431 
1432   // Fallback to the default implementation.
1433   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1434                                        Args, CxtI);
1435 }
1436 
1437 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1438                                            VectorType *BaseTp,
1439                                            ArrayRef<int> Mask,
1440                                            TTI::TargetCostKind CostKind,
1441                                            int Index, VectorType *SubTp,
1442                                            ArrayRef<const Value *> Args) {
1443   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1444   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1445   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1446 
1447   Kind = improveShuffleKindFromMask(Kind, Mask);
1448 
1449   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1450   if (Kind == TTI::SK_Transpose)
1451     Kind = TTI::SK_PermuteTwoSrc;
1452 
1453   // For Broadcasts we are splatting the first element from the first input
1454   // register, so only need to reference that input and all the output
1455   // registers are the same.
1456   if (Kind == TTI::SK_Broadcast)
1457     LT.first = 1;
1458 
1459   // Subvector extractions are free if they start at the beginning of a
1460   // vector and cheap if the subvectors are aligned.
1461   if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1462     int NumElts = LT.second.getVectorNumElements();
1463     if ((Index % NumElts) == 0)
1464       return 0;
1465     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1466     if (SubLT.second.isVector()) {
1467       int NumSubElts = SubLT.second.getVectorNumElements();
1468       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1469         return SubLT.first;
1470       // Handle some cases for widening legalization. For now we only handle
1471       // cases where the original subvector was naturally aligned and evenly
1472       // fit in its legalized subvector type.
1473       // FIXME: Remove some of the alignment restrictions.
1474       // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1475       // vectors.
1476       int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1477       if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1478           (NumSubElts % OrigSubElts) == 0 &&
1479           LT.second.getVectorElementType() ==
1480               SubLT.second.getVectorElementType() &&
1481           LT.second.getVectorElementType().getSizeInBits() ==
1482               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1483         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1484                "Unexpected number of elements!");
1485         auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1486                                            LT.second.getVectorNumElements());
1487         auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1488                                            SubLT.second.getVectorNumElements());
1489         int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1490         InstructionCost ExtractCost =
1491             getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1492                            CostKind, ExtractIndex, SubTy);
1493 
1494         // If the original size is 32-bits or more, we can use pshufd. Otherwise
1495         // if we have SSSE3 we can use pshufb.
1496         if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1497           return ExtractCost + 1; // pshufd or pshufb
1498 
1499         assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1500                "Unexpected vector size");
1501 
1502         return ExtractCost + 2; // worst case pshufhw + pshufd
1503       }
1504     }
1505   }
1506 
1507   // Subvector insertions are cheap if the subvectors are aligned.
1508   // Note that in general, the insertion starting at the beginning of a vector
1509   // isn't free, because we need to preserve the rest of the wide vector.
1510   if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1511     int NumElts = LT.second.getVectorNumElements();
1512     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1513     if (SubLT.second.isVector()) {
1514       int NumSubElts = SubLT.second.getVectorNumElements();
1515       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1516         return SubLT.first;
1517     }
1518 
1519     // If the insertion isn't aligned, treat it like a 2-op shuffle.
1520     Kind = TTI::SK_PermuteTwoSrc;
1521   }
1522 
1523   // Handle some common (illegal) sub-vector types as they are often very cheap
1524   // to shuffle even on targets without PSHUFB.
1525   EVT VT = TLI->getValueType(DL, BaseTp);
1526   if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1527       !ST->hasSSSE3()) {
1528      static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1529       {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
1530       {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
1531       {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
1532       {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
1533       {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck
1534 
1535       {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
1536       {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
1537       {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
1538       {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck
1539 
1540       {TTI::SK_Splice,           MVT::v4i16, 2}, // punpck+psrldq
1541       {TTI::SK_Splice,           MVT::v2i16, 2}, // punpck+psrldq
1542       {TTI::SK_Splice,           MVT::v4i8,  2}, // punpck+psrldq
1543       {TTI::SK_Splice,           MVT::v2i8,  2}, // punpck+psrldq
1544 
1545       {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
1546       {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
1547       {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
1548       {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
1549       {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck
1550 
1551       {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1552       {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1553       {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
1554       {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
1555       {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
1556     };
1557 
1558     if (ST->hasSSE2())
1559       if (const auto *Entry =
1560               CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1561         return Entry->Cost;
1562   }
1563 
1564   // We are going to permute multiple sources and the result will be in multiple
1565   // destinations. We provide an accurate cost only for splits where the element
1566   // type remains the same.
1567   if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1568     MVT LegalVT = LT.second;
1569     if (LegalVT.isVector() &&
1570         LegalVT.getVectorElementType().getSizeInBits() ==
1571             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1572         LegalVT.getVectorNumElements() <
1573             cast<FixedVectorType>(BaseTp)->getNumElements()) {
1574 
1575       unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1576       unsigned LegalVTSize = LegalVT.getStoreSize();
1577       // Number of source vectors after legalization:
1578       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1579       // Number of destination vectors after legalization:
1580       InstructionCost NumOfDests = LT.first;
1581 
1582       auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1583                                               LegalVT.getVectorNumElements());
1584 
1585       if (!Mask.empty() && NumOfDests.isValid()) {
1586         // Try to perform better estimation of the permutation.
1587         // 1. Split the source/destination vectors into real registers.
1588         // 2. Do the mask analysis to identify which real registers are
1589         // permuted. If more than one source register is used to build the
1590         // destination register, the cost for this destination register
1591         // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1592         // source register is used, build mask and calculate the cost as a cost
1593         // of PermuteSingleSrc.
1594         // Also, for the single register permute we try to identify if the
1595         // destination register is just a copy of the source register or the
1596         // copy of the previous destination register (the cost is
1597         // TTI::TCC_Basic). If the source register is just reused, the cost for
1598         // this operation is 0.
1599         unsigned E = *NumOfDests.getValue();
1600         unsigned NormalizedVF =
1601             LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1602         unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1603         unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1604         SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
1605         copy(Mask, NormalizedMask.begin());
1606         unsigned PrevSrcReg = 0;
1607         ArrayRef<int> PrevRegMask;
1608         InstructionCost Cost = 0;
1609         processShuffleMasks(
1610             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1611             [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1612              &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1613               if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
1614                 // Check if the previous register can be just copied to the next
1615                 // one.
1616                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1617                     PrevRegMask != RegMask)
1618                   Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1619                                          RegMask, CostKind, 0, nullptr);
1620                 else
1621                   // Just a copy of previous destination register.
1622                   Cost += TTI::TCC_Basic;
1623                 return;
1624               }
1625               if (SrcReg != DestReg &&
1626                   any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
1627                 // Just a copy of the source register.
1628                 Cost += TTI::TCC_Basic;
1629               }
1630               PrevSrcReg = SrcReg;
1631               PrevRegMask = RegMask;
1632             },
1633             [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1634                                                 unsigned /*Unused*/,
1635                                                 unsigned /*Unused*/) {
1636               Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1637                                      CostKind, 0, nullptr);
1638             });
1639         return Cost;
1640       }
1641 
1642       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1643       return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1644                                             std::nullopt, CostKind, 0, nullptr);
1645     }
1646 
1647     return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1648   }
1649 
1650   // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1651   if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1652     // We assume that source and destination have the same vector type.
1653     InstructionCost NumOfDests = LT.first;
1654     InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1655     LT.first = NumOfDests * NumOfShufflesPerDest;
1656   }
1657 
1658   static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1659       {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1660       {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1661 
1662       {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1663       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1664 
1665       {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1666       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1667       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
1668   };
1669 
1670   if (ST->hasVBMI())
1671     if (const auto *Entry =
1672             CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1673       return LT.first * Entry->Cost;
1674 
1675   static const CostTblEntry AVX512BWShuffleTbl[] = {
1676       {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1677       {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1678       {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb
1679 
1680       {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1681       {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1682       {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1683       {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2
1684 
1685       {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1686       {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1687       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1688       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1689       {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
1690 
1691       {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1692       {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1693       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1694       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
1695       {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1696 
1697       {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1698       {TTI::SK_Select, MVT::v64i8,  1}, // vblendmb
1699 
1700       {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1701       {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1702       {TTI::SK_Splice, MVT::v64i8,  2}, // vshufi64x2 + palignr
1703   };
1704 
1705   if (ST->hasBWI())
1706     if (const auto *Entry =
1707             CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1708       return LT.first * Entry->Cost;
1709 
1710   static const CostKindTblEntry AVX512ShuffleTbl[] = {
1711       {TTI::SK_Broadcast, MVT::v8f64,  { 1, 1, 1, 1 } }, // vbroadcastsd
1712       {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1713       {TTI::SK_Broadcast, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpbroadcastq
1714       {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1715       {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1716       {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1717       {TTI::SK_Broadcast, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpbroadcastb
1718 
1719       {TTI::SK_Reverse, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1720       {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1721       {TTI::SK_Reverse, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1722       {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1723       {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1724       {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1725       {TTI::SK_Reverse, MVT::v64i8,  { 7, 7, 7, 7 } }, // per mca
1726 
1727       {TTI::SK_Splice, MVT::v8f64,  { 1, 1, 1, 1 } }, // vpalignd
1728       {TTI::SK_Splice, MVT::v4f64,  { 1, 1, 1, 1 } }, // vpalignd
1729       {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1730       {TTI::SK_Splice, MVT::v8f32,  { 1, 1, 1, 1 } }, // vpalignd
1731       {TTI::SK_Splice, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpalignd
1732       {TTI::SK_Splice, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpalignd
1733       {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1734       {TTI::SK_Splice, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpalignd
1735       {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1736       {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1737       {TTI::SK_Splice, MVT::v64i8,  { 4, 4, 4, 4 } }, // split + palignr
1738 
1739       {TTI::SK_PermuteSingleSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1740       {TTI::SK_PermuteSingleSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermpd
1741       {TTI::SK_PermuteSingleSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermpd
1742       {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1743       {TTI::SK_PermuteSingleSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermps
1744       {TTI::SK_PermuteSingleSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermps
1745       {TTI::SK_PermuteSingleSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1746       {TTI::SK_PermuteSingleSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermq
1747       {TTI::SK_PermuteSingleSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermq
1748       {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1749       {TTI::SK_PermuteSingleSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermd
1750       {TTI::SK_PermuteSingleSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermd
1751       {TTI::SK_PermuteSingleSrc, MVT::v16i8,  { 1, 3, 1, 1 } }, // pshufb
1752 
1753       {TTI::SK_PermuteTwoSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1754       {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1755       {TTI::SK_PermuteTwoSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermt2q
1756       {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1757       {TTI::SK_PermuteTwoSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1758       {TTI::SK_PermuteTwoSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1759       {TTI::SK_PermuteTwoSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermt2q
1760       {TTI::SK_PermuteTwoSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermt2d
1761       {TTI::SK_PermuteTwoSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1762       {TTI::SK_PermuteTwoSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1763       {TTI::SK_PermuteTwoSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermt2q
1764       {TTI::SK_PermuteTwoSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermt2d
1765 
1766       // FIXME: This just applies the type legalization cost rules above
1767       // assuming these completely split.
1768       {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1769       {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1770       {TTI::SK_PermuteSingleSrc, MVT::v64i8,  { 14, 14, 14, 14 } },
1771       {TTI::SK_PermuteTwoSrc,    MVT::v32i16, { 42, 42, 42, 42 } },
1772       {TTI::SK_PermuteTwoSrc,    MVT::v32f16, { 42, 42, 42, 42 } },
1773       {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  { 42, 42, 42, 42 } },
1774 
1775       {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1776       {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1777       {TTI::SK_Select, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpternlogq
1778       {TTI::SK_Select, MVT::v8f64,  { 1, 1, 1, 1 } }, // vblendmpd
1779       {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1780       {TTI::SK_Select, MVT::v8i64,  { 1, 1, 1, 1 } }, // vblendmq
1781       {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1782   };
1783 
1784   if (ST->hasAVX512())
1785     if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1786       if (auto KindCost = Entry->Cost[CostKind])
1787         return LT.first * *KindCost;
1788 
1789   static const CostTblEntry AVX2ShuffleTbl[] = {
      // AVX2 shuffle costs, one entry per (shuffle kind, MVT) pair. The table
      // is consulted right after its definition, only when ST->hasAVX2(), via
      // CostTableLookup(AVX2ShuffleTbl, Kind, LT.second); a matching entry's
      // Cost is multiplied by LT.first. Trailing comments name the instruction
      // sequence each cost stands for.
1790       {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastsd
1791       {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastss
1792       {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
1793       {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
1794       {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1795       {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1796       {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb
1797 
1798       {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
1799       {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
1800       {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
1801       {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
1802       {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1803       {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1804       {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb
1805 
1806       {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1807       {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1808       {TTI::SK_Select, MVT::v32i8,  1}, // vpblendvb
1809 
1810       {TTI::SK_Splice, MVT::v8i32,  2}, // vperm2i128 + vpalignr
1811       {TTI::SK_Splice, MVT::v8f32,  2}, // vperm2i128 + vpalignr
1812       {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1813       {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1814       {TTI::SK_Splice, MVT::v32i8,  2}, // vperm2i128 + vpalignr
1815 
1816       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
1817       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
1818       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
1819       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
1820       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1821                                                   // + vpblendvb
1822       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1823                                                   // + vpblendvb
1824       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
1825                                                   // + vpblendvb
1826 
1827       {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
1828       {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
1829       {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
1830       {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
1831       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1832                                                // + vpblendvb
1833       {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1834                                                // + vpblendvb
1835       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
1836                                                // + vpblendvb
1837   };
1838 
1839   if (ST->hasAVX2())
1840     if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1841       return LT.first * Entry->Cost;
1842 
1843   static const CostTblEntry XOPShuffleTbl[] = {
1844       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
1845       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
1846       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
1847       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
1848       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1849                                                   // + vinsertf128
1850       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
1851                                                   // + vinsertf128
1852 
1853       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1854                                                // + vinsertf128
1855       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
1856       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
1857                                                // + vinsertf128
1858       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
1859   };
1860 
1861   if (ST->hasXOP())
1862     if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1863       return LT.first * Entry->Cost;
1864 
1865   static const CostTblEntry AVX1ShuffleTbl[] = {
1866       {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1867       {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1868       {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1869       {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1870       {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1871       {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1872       {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128
1873 
1874       {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1875       {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1876       {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1877       {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1878       {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1879                                          // + vinsertf128
1880       {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1881                                          // + vinsertf128
1882       {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
1883                                          // + vinsertf128
1884 
1885       {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
1886       {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
1887       {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
1888       {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
1889       {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1890       {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1891       {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor
1892 
1893       {TTI::SK_Splice, MVT::v4i64,  2}, // vperm2f128 + shufpd
1894       {TTI::SK_Splice, MVT::v4f64,  2}, // vperm2f128 + shufpd
1895       {TTI::SK_Splice, MVT::v8i32,  4}, // 2*vperm2f128 + 2*vshufps
1896       {TTI::SK_Splice, MVT::v8f32,  4}, // 2*vperm2f128 + 2*vshufps
1897       {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1898       {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1899       {TTI::SK_Splice, MVT::v32i8,  5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1900 
1901       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
1902       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
1903       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
1904       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
1905       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1906                                                   // + 2*por + vinsertf128
1907       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1908                                                   // + 2*por + vinsertf128
1909       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
1910                                                   // + 2*por + vinsertf128
1911 
1912       {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
1913       {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
1914       {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
1915       {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
1916       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1917                                                 // + 4*por + vinsertf128
1918       {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1919                                                 // + 4*por + vinsertf128
1920       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
1921                                                 // + 4*por + vinsertf128
1922   };
1923 
1924   if (ST->hasAVX())
1925     if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1926       return LT.first * Entry->Cost;
1927 
1928   static const CostTblEntry SSE41ShuffleTbl[] = {
1929       {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1930       {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1931       {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1932       {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1933       {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1934       {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1935       {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
1936   };
1937 
1938   if (ST->hasSSE41())
1939     if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1940       return LT.first * Entry->Cost;
1941 
1942   static const CostTblEntry SSSE3ShuffleTbl[] = {
1943       {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1944       {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1945       {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1946 
1947       {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1948       {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1949       {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1950 
1951       {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1952       {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1953       {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1954 
1955       {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
1956       {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
1957       {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
1958       {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
1959       {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
1960 
1961       {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1962       {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1963       {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1964 
1965       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1966       {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1967       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1968   };
1969 
1970   if (ST->hasSSSE3())
1971     if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1972       return LT.first * Entry->Cost;
1973 
1974   static const CostTblEntry SSE2ShuffleTbl[] = {
      // SSE2 shuffle costs, one entry per (shuffle kind, MVT) pair. The table
      // is consulted below inside the ST->hasSSE2() branch, after the SSE3
      // broadcast-load special case, via
      // CostTableLookup(SSE2ShuffleTbl, Kind, LT.second); a matching entry's
      // Cost is multiplied by LT.first. Trailing comments name the instruction
      // sequence each cost stands for.
1975       {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1976       {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1977       {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1978       {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1979       {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
1980       {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1981 
1982       {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1983       {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1984       {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1985       {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1986       {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
1987       {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1988                                         // + 2*pshufd + 2*unpck + packus
1989 
1990       {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1991       {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1992       {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1993       {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1994       {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
1995       {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1996 
1997       {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
1998       {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
1999       {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2000       {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2001       {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2002       {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2003 
2004       {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2005       {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2006       {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2007       {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2008                                                   // + pshufd/unpck
2009       {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2010                                                   // + pshufd/unpck
2011     { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2012                                                   // + 2*pshufd + 2*unpck + 2*packus
2013 
2014     { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
2015     { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
2016     { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
2017     { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
2018     { TTI::SK_PermuteTwoSrc,    MVT::v8f16,  8 }, // blend+permute
2019     { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
2020   };
2021 
2022   static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2023       {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2024   };
2025 
2026   if (ST->hasSSE2()) {
2027     bool IsLoad =
2028         llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2029     if (ST->hasSSE3() && IsLoad)
2030       if (const auto *Entry =
2031               CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2032         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2033                                     LT.second.getVectorElementCount()) &&
2034                "Table entry missing from isLegalBroadcastLoad()");
2035         return LT.first * Entry->Cost;
2036       }
2037 
2038     if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2039       return LT.first * Entry->Cost;
2040   }
2041 
2042   static const CostTblEntry SSE1ShuffleTbl[] = {
2043     { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
2044     { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
2045     { TTI::SK_Select,           MVT::v4f32, 2 }, // 2*shufps
2046     { TTI::SK_Splice,           MVT::v4f32, 2 }, // 2*shufps
2047     { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2048     { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
2049   };
2050 
2051   if (ST->hasSSE1())
2052     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2053       return LT.first * Entry->Cost;
2054 
2055   return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2056 }
2057 
2058 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2059                                              Type *Src,
2060                                              TTI::CastContextHint CCH,
2061                                              TTI::TargetCostKind CostKind,
2062                                              const Instruction *I) {
2063   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2064   assert(ISD && "Invalid opcode");
2065 
2066   // TODO: Allow non-throughput costs that aren't binary.
2067   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2068     if (CostKind != TTI::TCK_RecipThroughput)
2069       return Cost == 0 ? 0 : 1;
2070     return Cost;
2071   };
2072 
2073   // The cost tables include both specific, custom (non-legal) src/dst type
2074   // conversions and generic, legalized types. We test for customs first, before
2075   // falling back to legalization.
2076   // FIXME: Need a better design of the cost table to handle non-simple types of
2077   // potential massive combinations (elem_num x src_type x dst_type).
2078   static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2079     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2080     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2081 
2082     // Mask sign extend has an instruction.
2083     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
2084     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
2085     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
2086     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
2087     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
2088     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
2089     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
2090     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
2091     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
2092     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
2093     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
2094     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
2095     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2096     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
2097     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1,  1 },
2098     { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1,  1 },
2099     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1,  1 },
2100 
2101     // Mask zero extend is a sext + shift.
2102     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
2103     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
2104     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
2105     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
2106     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
2107     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
2108     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
2109     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
2110     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
2111     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
2112     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
2113     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
2114     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
2115     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
2116     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1,  2 },
2117     { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1,  2 },
2118     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1,  2 },
2119 
2120     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
2121     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
2122     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
2123     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
2124     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
2125     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
2126     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
2127     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
2128     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
2129     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
2130     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
2131     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
2132     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
2133     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
2134     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, 2 },
2135     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  2 },
2136     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i16, 2 },
2137 
2138     { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, 2 },
2139     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // widen to zmm
2140     { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  2 }, // vpmovwb
2141     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 }, // vpmovwb
2142     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 }, // vpmovwb
2143   };
2144 
2145   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2146     // Mask sign extend has an instruction.
2147     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
2148     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
2149     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
2150     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
2151     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
2152     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i1,  1 },
2153     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 },
2154     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 },
2155 
2156     // Mask zero extend is a sext + shift.
2157     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
2158     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
2159     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
2160     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
2161     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
2162     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i1,  2 },
2163     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 },
2164     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
2165 
2166     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
2167     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
2168     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
2169     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
2170     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2171     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  2 },
2172     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i32, 2 },
2173     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i64,  2 },
2174 
2175     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
2176     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
2177 
2178     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
2179     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
2180 
2181     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
2182     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
2183 
2184     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
2185     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
2186   };
2187 
2188   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2189   // 256-bit wide vectors.
2190 
2191   static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2192     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
2193     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
2194     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
2195 
2196     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
2197     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
2198     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
2199     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  3 }, // sext+vpslld+vptestmd
2200     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
2201     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
2202     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
2203     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2204     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // zmm vpslld+vptestmd
2205     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // zmm vpslld+vptestmd
2206     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // zmm vpslld+vptestmd
2207     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32, 2 }, // vpslld+vptestmd
2208     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // zmm vpsllq+vptestmq
2209     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // zmm vpsllq+vptestmq
2210     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,  2 }, // vpsllq+vptestmq
2211     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i32,  2 }, // vpmovdb
2212     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i32,  2 }, // vpmovdb
2213     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 2 }, // vpmovdb
2214     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v16i32, 2 }, // vpmovdb
2215     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v16i32, 2 }, // vpmovdb
2216     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 2 }, // vpmovdw
2217     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v16i32, 2 }, // vpmovdw
2218     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i64,  2 }, // vpmovqb
2219     { ISD::TRUNCATE,  MVT::v2i16,   MVT::v2i64,  1 }, // vpshufb
2220     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,  2 }, // vpmovqb
2221     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v8i64,  2 }, // vpmovqb
2222     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v8i64,  2 }, // vpmovqb
2223     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v8i64,  2 }, // vpmovqb
2224     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  2 }, // vpmovqw
2225     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v8i64,  2 }, // vpmovqw
2226     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v8i64,  2 }, // vpmovqw
2227     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 }, // vpmovqd
2228     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // zmm vpmovqd
2229     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2230 
2231     { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,  3 }, // extend to v16i32
2232     { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,  8 },
2233     { ISD::TRUNCATE,  MVT::v64i8,  MVT::v32i16,  8 },
2234 
2235     // Sign extend is zmm vpternlogd+vptruncdb.
2236     // Zero extend is zmm broadcast load+vptruncdw.
2237     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   3 },
2238     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   4 },
2239     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   3 },
2240     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   4 },
2241     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   3 },
2242     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   4 },
2243     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  3 },
2244     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  4 },
2245 
2246     // Sign extend is zmm vpternlogd+vptruncdw.
2247     // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2248     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   3 },
2249     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
2250     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   3 },
2251     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
2252     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   3 },
2253     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
2254     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  3 },
2255     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2256 
2257     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // zmm vpternlogd
2258     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // zmm vpternlogd+psrld
2259     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // zmm vpternlogd
2260     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // zmm vpternlogd+psrld
2261     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // zmm vpternlogd
2262     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // zmm vpternlogd+psrld
2263     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // zmm vpternlogq
2264     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // zmm vpternlogq+psrlq
2265     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // zmm vpternlogq
2266     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // zmm vpternlogq+psrlq
2267 
2268     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 }, // vpternlogd
2269     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 }, // vpternlogd+psrld
2270     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 }, // vpternlogq
2271     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 }, // vpternlogq+psrlq
2272 
2273     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
2274     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
2275     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2276     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2277     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
2278     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
2279     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
2280     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
2281     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
2282     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
2283 
2284     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
2285     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
2286 
2287     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
2288     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
2289     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
2290     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
2291     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
2292     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
2293     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
2294     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
2295 
2296     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
2297     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
2298     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
2299     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
2300     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
2301     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
2302     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
2303     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
2304     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
2305     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
2306 
2307     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
2308     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f64, 7 },
2309     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f64,15 },
2310     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f32,11 },
2311     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f64,31 },
2312     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  3 },
2313     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f64, 7 },
2314     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f32, 5 },
2315     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f64,15 },
2316     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  1 },
2317     { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, 3 },
2318 
2319     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
2320     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  3 },
2321     { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  3 },
2322     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
2323     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 3 },
2324     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 3 },
2325   };
2326 
2327   static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
         // Cast costs consulted when ST->hasBWI() is true. Unlike the
         // AVX512BW/DQ/F tables, this lookup is not nested under the
         // ST->useAVX512Regs() check further down in this function.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below. In the legalized-type
         // fallback the matched cost is multiplied by
         // max(LTSrc.first, LTDest.first).
2328     // Mask sign extend has an instruction.
2329     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
2330     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
2331     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
2332     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
2333     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
2334     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
2335     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
2336     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
2337     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
2338     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
2339     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
2340     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
2341     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2342     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
2343     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1,  1 },
2344     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v64i1,  1 },
2345     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1,  1 },
2346 
2347     // Mask zero extend is a sext + shift.
2348     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
2349     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
2350     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
2351     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
2352     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
2353     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
2354     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
2355     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
2356     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
2357     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
2358     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
2359     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
2360     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
2361     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
2362     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1,  2 },
2363     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v64i1,  2 },
2364     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1,  2 },
2365 
         // Truncation of i8/i16 element vectors down to vXi1 mask vectors.
2366     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
2367     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
2368     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
2369     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
2370     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
2371     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
2372     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
2373     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
2374     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
2375     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
2376     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
2377     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
2378     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
2379     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
2380     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v16i16, 2 },
2381     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i8,  2 },
2382     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v16i16, 2 },
2383 
         // Element-narrowing truncate: 16 x i16 -> 16 x i8.
2384     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 },
2385   };
2386 
2387   static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
         // Cast costs consulted when ST->hasDQI() is true; the lookup is not
         // nested under the ST->useAVX512Regs() check further down in this
         // function and runs after the AVX512BWVL lookup.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below. In the legalized-type
         // fallback the matched cost is multiplied by
         // max(LTSrc.first, LTDest.first).
2388     // Mask sign extend has an instruction.
2389     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
2390     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
2391     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
2392     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  1 },
2393     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
2394     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   1 },
2395     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 },
2396     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
2397 
2398     // Mask zero extend is a sext + shift.
2399     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
2400     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
2401     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
2402     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  2 },
2403     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
2404     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   2 },
2405     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 },
2406     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
2407 
         // Truncation of i32/i64 element vectors down to vXi1 mask vectors.
2408     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  2 },
2409     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  2 },
2410     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
2411     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
2412     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
2413     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
2414     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  2 },
2415     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2416 
         // Signed i64 vectors -> f32/f64 vectors.
2417     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
2418     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
2419     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
2420     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
2421 
         // Unsigned i64 vectors -> f32/f64 vectors.
2422     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
2423     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
2424     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
2425     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
2426 
         // f32/f64 vectors -> signed i64 vectors.
2427     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  1 },
2428     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
2429     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
2430     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
2431 
         // f32/f64 vectors -> unsigned i64 vectors.
2432     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  1 },
2433     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
2434     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
2435     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
2436   };
2437 
2438   static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
         // Cast costs consulted when ST->hasAVX512() is true; the lookup is
         // not nested under the ST->useAVX512Regs() check further down in
         // this function and runs after the AVX512BWVL and AVX512DQVL
         // lookups.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below. In the legalized-type
         // fallback the matched cost is multiplied by
         // max(LTSrc.first, LTDest.first).
2439     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
2440     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
2441     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
2442     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  8 }, // split+2*v8i8
2443     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
2444     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
2445     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
2446     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 8 }, // split+2*v8i16
2447     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // vpslld+vptestmd
2448     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // vpslld+vptestmd
2449     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // vpslld+vptestmd
2450     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v8i32,  2 }, // vpslld+vptestmd
2451     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // vpsllq+vptestmq
2452     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // vpsllq+vptestmq
2453     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // vpmovqd
2454     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i64,  2 }, // vpmovqb
2455     { ISD::TRUNCATE,  MVT::v4i16,   MVT::v4i64,  2 }, // vpmovqw
2456     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i32,  2 }, // vpmovwb
         // NOTE(review): the source elements of the row above are i32;
         // going by the vpmovq{d,b,w} naming of the preceding rows this is
         // presumably vpmovdb rather than vpmovwb - confirm.
2457 
2458     // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2459     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2460     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   5 },
2461     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   6 },
2462     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   5 },
2463     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   6 },
2464     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   5 },
2465     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   6 },
2466     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 10 },
2467     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 12 },
2468 
2469     // sign extend is vpcmpeq+maskedmove+vpmovdw
2470     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2471     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
2472     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   5 },
2473     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
2474     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   5 },
2475     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
2476     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   5 },
2477     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2478     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2479 
         // Mask (vXi1) extension to i32 element vectors.
2480     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // vpternlogd
2481     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // vpternlogd+psrld
2482     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // vpternlogd
2483     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // vpternlogd+psrld
2484     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // vpternlogd
2485     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // vpternlogd+psrld
2486     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 }, // vpternlogd
2487     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 }, // vpternlogd+psrld
2488 
         // Mask (vXi1) extension to i64 element vectors.
2489     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // vpternlogq
2490     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // vpternlogq+psrlq
2491     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // vpternlogq
2492     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // vpternlogq+psrlq
2493 
         // Integer element widening from 128-bit sources to 256-bit results.
2494     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
2495     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
2496     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
2497     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
2498     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
2499     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
2500     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
2501     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
2502     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
2503     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
2504     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
2505     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
2506 
         // Signed integer -> floating point.
2507     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2508     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
2509     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2510     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
2511 
         // Unsigned integer -> floating point (includes scalar i64 rows).
2512     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
2513     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
2514     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2515     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
2516     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2517     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
2518     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
2519     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
2520     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
2521     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
2522     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
2523     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
2524     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
2525 
         // Floating point -> signed i8 element vectors.
2526     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
2527     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
2528     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, 5 },
2529 
         // Floating point -> unsigned integer (includes scalar i64 rows).
2530     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    1 },
2531     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    1 },
2532     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
2533     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  1 },
2534     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
2535     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
2536     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
2537   };
2538 
2539   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
         // Cast costs consulted when ST->hasAVX2() is true, after all of the
         // AVX-512 table lookups further down in this function have missed.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below. In the legalized-type
         // fallback the matched cost is multiplied by
         // max(LTSrc.first, LTDest.first).

         // Sign/zero extension from vXi1 vectors.
2540     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
2541     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
2542     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
2543     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
2544     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2545     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2546 
         // Integer element widening.
2547     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
2548     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
2549     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
2550     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
2551     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
2552     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
2553     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
2554     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
2555     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
2556     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
2557     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2558     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2559     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
2560     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
2561 
         // Truncation to a vXi1 vector.
2562     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2563 
         // Integer element narrowing.
2564     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 4 },
2565     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 4 },
2566     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  1 },
2567     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  1 },
2568     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  1 },
2569     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  4 },
2570     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  4 },
2571     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  1 },
2572     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  1 },
2573     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  5 },
2574     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  1 },
2575     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
2576 
         // f32 <-> f64 vector conversions.
2577     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
2578     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
2579 
         // Floating point -> signed integer.
2580     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  1 },
2581     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  1 },
2582     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  1 },
2583     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  3 },
2584 
         // Floating point -> unsigned integer (includes scalar i64 rows).
2585     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    3 },
2586     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    3 },
2587     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  1 },
2588     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
2589     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2590     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4 },
2591     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  3 },
2592     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  4 },
2593 
         // Signed integer -> floating point.
2594     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
2595     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
2596     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
2597     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
2598     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
2599     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
2600     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  3 },
2601 
         // Unsigned integer -> floating point.
2602     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
2603     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
2604     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
2605     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
2606     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
2607     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
2608     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  2 },
2609     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2610     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
2611     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
2612   };
2613 
2614   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
         // Cast costs consulted when ST->hasAVX() is true, after the AVX-512
         // and AVX2 table lookups further down in this function have missed.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below. The table is searched both
         // with the simple value types and, failing that, with the legalized
         // types.

         // Sign/zero extension from vXi1 vectors.
2615     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   6 },
2616     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
2617     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   7 },
2618     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
2619     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2620     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2621 
         // Integer element widening.
2622     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
2623     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
2624     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
2625     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
2626     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
2627     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
2628     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
2629     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
2630     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
2631     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
2632     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
2633     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
2634 
         // Truncation to vXi1 vectors.
2635     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  4 },
2636     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  5 },
2637     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 4 },
2638     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  9 },
2639     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, 11 },
2640 
         // Integer element narrowing.
2641     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
2642     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 6 },
2643     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // and+extract+packuswb
2644     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  5 },
2645     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
2646     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  5 },
2647     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  3 }, // and+extract+2*packusdw
2648     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
2649 
         // Signed integer -> floating point.
2650     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   3 },
2651     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   3 },
2652     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   8 },
2653     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
2654     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
2655     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
2656     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
2657     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2658     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
2659     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
2660     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  5 },
2661     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  8 },
2662 
         // Unsigned integer -> floating point.
2663     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   7 },
2664     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   7 },
2665     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   6 },
2666     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
2667     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
2668     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
2669     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
2670     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  4 },
2671     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  4 },
2672     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
2673     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  6 },
2674     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
2675     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32, 10 },
2676     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64, 10 },
2677     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 18 },
2678     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
2679     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 10 },
2680 
         // Floating point -> signed integer.
2681     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
2682     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  2 },
2683     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  2 },
2684     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  2 },
2685     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  2 },
2686     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  2 },
2687     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  2 },
2688     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  2 },
2689     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  2 },
2690     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  2 },
2691     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  5 },
2692 
         // Floating point -> unsigned integer.
2693     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  2 },
2694     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  2 },
2695     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  2 },
2696     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  2 },
2697     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  2 },
2698     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  2 },
2699     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  2 },
2700     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  2 },
2701     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
2702     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2703     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  6 },
2704     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  7 },
2705     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  7 },
2706 
         // f32 <-> f64 vector conversions.
2707     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
2708     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
2709   };
2710 
2711   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
         // Cast costs consulted when ST->hasSSE41() is true, after the
         // AVX-512, AVX2 and AVX table lookups further down in this function
         // have missed.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below.

         // Integer element widening within 128-bit vectors.
2712     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
2713     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
2714     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
2715     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
2716     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
2717     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
2718     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
2719     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
2720     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
2721     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
2722     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
2723     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
2724 
2725     // These truncates end up widening elements.
2726     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   1 }, // PMOVZXBQ
2727     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  1 }, // PMOVZXWQ
2728     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   1 }, // PMOVZXBD
2729 
         // Integer element narrowing.
2730     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  2 },
2731     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  2 },
2732     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  2 },
2733 
         // Signed integer -> floating point (scalar rows first).
2734     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
2735     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
2736     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
2737     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
2738     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
2739     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2740     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
2741     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2742     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
2743     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  1 },
2744     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2745 
         // Unsigned integer -> floating point (scalar rows first).
2746     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
2747     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
2748     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
2749     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
2750     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
2751     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2752     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
2753     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2754     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  3 },
2755     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
2756     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  2 },
2757     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 12 },
2758     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 22 },
2759     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  4 },
2760 
         // Floating point -> signed integer (scalar rows first).
2761     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    1 },
2762     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    1 },
2763     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    1 },
2764     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    1 },
2765     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  2 },
2766     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  2 },
2767     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  1 },
2768     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  1 },
2769     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  1 },
2770     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  1 },
2771 
         // Floating point -> unsigned integer (scalar rows first).
2772     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    1 },
2773     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
2774     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    1 },
2775     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
2776     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  2 },
2777     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  2 },
2778     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  1 },
2779     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  1 },
2780     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
2781     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2782   };
2783 
2784   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
         // Cast costs consulted when ST->hasSSE2() is true; in the
         // simple-type lookup further down in this function it is the last
         // table tried.
         // Row layout: { ISD opcode, destination MVT, source MVT, cost },
         // matching the (ISD, Dst, Src) argument order of the
         // ConvertCostTableLookup calls below.
2785     // These are somewhat magic numbers justified by comparing the
2786     // output of llvm-mca for our various supported scheduler models
2787     // and basing it off the worst case scenario.
2788     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
2789     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
2790     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    3 },
2791     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    3 },
2792     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  3 },
2793     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
2794     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  3 },
2795     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
2796     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
2797     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  4 },
2798     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  8 },
2799     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  8 },
2800 
         // Unsigned integer -> floating point (scalar rows first).
2801     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
2802     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
2803     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    8 },
2804     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    9 },
2805     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
2806     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  4 },
2807     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  4 },
2808     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
2809     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  7 },
2810     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  7 },
2811     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
2812     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64, 15 },
2813     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 18 },
2814 
         // Floating point -> signed integer (scalar rows first).
2815     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    4 },
2816     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    4 },
2817     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    4 },
2818     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    4 },
2819     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  6 },
2820     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  6 },
2821     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  5 },
2822     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  5 },
2823     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  4 },
2824     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  4 },
2825 
         // Floating point -> unsigned integer (scalar rows first).
2826     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    4 },
2827     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
2828     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    4 },
2829     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,   15 },
2830     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  6 },
2831     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  6 },
2832     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  5 },
2833     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  5 },
2834     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  8 },
2835     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  8 },
2836 
         // Integer element widening within 128-bit vectors.
2837     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
2838     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
2839     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  2 },
2840     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  3 },
2841     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  1 },
2842     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  2 },
2843     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  2 },
2844     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  3 },
2845     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  1 },
2846     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  2 },
2847     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  1 },
2848     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  2 },
2849 
2850     // These truncates are really widening elements.
2851     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  1 }, // PSHUFD
2852     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // PUNPCKLWD+DQ
2853     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   3 }, // PUNPCKLBW+WD+PSHUFD
2854     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  1 }, // PUNPCKLWD
2855     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // PUNPCKLBW+WD
2856     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   1 }, // PUNPCKLBW
2857 
         // Integer element narrowing.
2858     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  2 }, // PAND+PACKUSWB
2859     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
2860     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  3 }, // PAND+2*PACKUSWB
2861     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
2862     { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  1 },
2863     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  3 },
2864     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
2865     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32,10 },
2866     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  4 }, // PAND+3*PACKUSWB
2867     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  2 }, // PSHUFD+PSHUFLW
2868     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  1 }, // PSHUFD
2869   };
2870 
2871   // Attempt to map directly to (simple) MVT types to let us match custom entries.
2872   EVT SrcTy = TLI->getValueType(DL, Src);
2873   EVT DstTy = TLI->getValueType(DL, Dst);
2874 
2875   // The function getSimpleVT only handles simple value types.
2876   if (SrcTy.isSimple() && DstTy.isSimple()) {
2877     MVT SimpleSrcTy = SrcTy.getSimpleVT();
2878     MVT SimpleDstTy = DstTy.getSimpleVT();
2879 
2880     if (ST->useAVX512Regs()) {
2881       if (ST->hasBWI())
2882         if (const auto *Entry = ConvertCostTableLookup(
2883                 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2884           return AdjustCost(Entry->Cost);
2885 
2886       if (ST->hasDQI())
2887         if (const auto *Entry = ConvertCostTableLookup(
2888                 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2889           return AdjustCost(Entry->Cost);
2890 
2891       if (ST->hasAVX512())
2892         if (const auto *Entry = ConvertCostTableLookup(
2893                 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2894           return AdjustCost(Entry->Cost);
2895     }
2896 
2897     if (ST->hasBWI())
2898       if (const auto *Entry = ConvertCostTableLookup(
2899               AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2900         return AdjustCost(Entry->Cost);
2901 
2902     if (ST->hasDQI())
2903       if (const auto *Entry = ConvertCostTableLookup(
2904               AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2905         return AdjustCost(Entry->Cost);
2906 
2907     if (ST->hasAVX512())
2908       if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2909                                                      SimpleDstTy, SimpleSrcTy))
2910         return AdjustCost(Entry->Cost);
2911 
2912     if (ST->hasAVX2()) {
2913       if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2914                                                      SimpleDstTy, SimpleSrcTy))
2915         return AdjustCost(Entry->Cost);
2916     }
2917 
2918     if (ST->hasAVX()) {
2919       if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2920                                                      SimpleDstTy, SimpleSrcTy))
2921         return AdjustCost(Entry->Cost);
2922     }
2923 
2924     if (ST->hasSSE41()) {
2925       if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2926                                                      SimpleDstTy, SimpleSrcTy))
2927         return AdjustCost(Entry->Cost);
2928     }
2929 
2930     if (ST->hasSSE2()) {
2931       if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2932                                                      SimpleDstTy, SimpleSrcTy))
2933         return AdjustCost(Entry->Cost);
2934     }
2935   }
2936 
2937   // Fall back to legalized types.
2938   std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2939   std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2940 
2941   // If we're truncating to the same legalized type - just assume its free.
2942   if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2943     return TTI::TCC_Free;
2944 
2945   if (ST->useAVX512Regs()) {
2946     if (ST->hasBWI())
2947       if (const auto *Entry = ConvertCostTableLookup(
2948               AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2949         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2950 
2951     if (ST->hasDQI())
2952       if (const auto *Entry = ConvertCostTableLookup(
2953               AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2954         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2955 
2956     if (ST->hasAVX512())
2957       if (const auto *Entry = ConvertCostTableLookup(
2958               AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2959         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2960   }
2961 
2962   if (ST->hasBWI())
2963     if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2964                                                    LTDest.second, LTSrc.second))
2965       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2966 
2967   if (ST->hasDQI())
2968     if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2969                                                    LTDest.second, LTSrc.second))
2970       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2971 
2972   if (ST->hasAVX512())
2973     if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2974                                                    LTDest.second, LTSrc.second))
2975       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2976 
2977   if (ST->hasAVX2())
2978     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2979                                                    LTDest.second, LTSrc.second))
2980       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2981 
2982   if (ST->hasAVX())
2983     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2984                                                    LTDest.second, LTSrc.second))
2985       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2986 
2987   if (ST->hasSSE41())
2988     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2989                                                    LTDest.second, LTSrc.second))
2990       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2991 
2992   if (ST->hasSSE2())
2993     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2994                                                    LTDest.second, LTSrc.second))
2995       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2996 
2997   // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
2998   // sitofp.
2999   if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3000       1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3001     Type *ExtSrc = Src->getWithNewBitWidth(32);
3002     unsigned ExtOpc =
3003         (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3004 
3005     // For scalar loads the extend would be free.
3006     InstructionCost ExtCost = 0;
3007     if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3008       ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3009 
3010     return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3011                                       TTI::CastContextHint::None, CostKind);
3012   }
3013 
3014   // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3015   // i32.
3016   if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3017       1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3018     Type *TruncDst = Dst->getWithNewBitWidth(32);
3019     return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3020            getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3021                             TTI::CastContextHint::None, CostKind);
3022   }
3023 
3024   return AdjustCost(
3025       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3026 }
3027 
3028 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3029                                                Type *CondTy,
3030                                                CmpInst::Predicate VecPred,
3031                                                TTI::TargetCostKind CostKind,
3032                                                const Instruction *I) {
3033   // Early out if this type isn't scalar/vector integer/float.
3034   if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3035     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3036                                      I);
3037 
3038   // Legalize the type.
3039   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3040 
3041   MVT MTy = LT.second;
3042 
3043   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3044   assert(ISD && "Invalid opcode");
3045 
3046   InstructionCost ExtraCost = 0;
3047   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3048     // Some vector comparison predicates cost extra instructions.
3049     // TODO: Should we invert this and assume worst case cmp costs
3050     // and reduce for particular predicates?
3051     if (MTy.isVector() &&
3052         !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3053           (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3054           ST->hasBWI())) {
3055       // Fallback to I if a specific predicate wasn't specified.
3056       CmpInst::Predicate Pred = VecPred;
3057       if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3058                 Pred == CmpInst::BAD_FCMP_PREDICATE))
3059         Pred = cast<CmpInst>(I)->getPredicate();
3060 
3061       switch (Pred) {
3062       case CmpInst::Predicate::ICMP_NE:
3063         // xor(cmpeq(x,y),-1)
3064         ExtraCost = 1;
3065         break;
3066       case CmpInst::Predicate::ICMP_SGE:
3067       case CmpInst::Predicate::ICMP_SLE:
3068         // xor(cmpgt(x,y),-1)
3069         ExtraCost = 1;
3070         break;
3071       case CmpInst::Predicate::ICMP_ULT:
3072       case CmpInst::Predicate::ICMP_UGT:
3073         // cmpgt(xor(x,signbit),xor(y,signbit))
3074         // xor(cmpeq(pmaxu(x,y),x),-1)
3075         ExtraCost = 2;
3076         break;
3077       case CmpInst::Predicate::ICMP_ULE:
3078       case CmpInst::Predicate::ICMP_UGE:
3079         if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3080             (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3081           // cmpeq(psubus(x,y),0)
3082           // cmpeq(pminu(x,y),x)
3083           ExtraCost = 1;
3084         } else {
3085           // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3086           ExtraCost = 3;
3087         }
3088         break;
3089       case CmpInst::Predicate::FCMP_ONE:
3090       case CmpInst::Predicate::FCMP_UEQ:
3091         // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3092         // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3093         if (CondTy && !ST->hasAVX())
3094           return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3095                                     CmpInst::Predicate::FCMP_UNO, CostKind) +
3096                  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3097                                     CmpInst::Predicate::FCMP_OEQ, CostKind) +
3098                  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3099 
3100         break;
3101       case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3102       case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3103         // Assume worst case scenario and add the maximum extra cost.
3104         ExtraCost = 3;
3105         break;
3106       default:
3107         break;
3108       }
3109     }
3110   }
3111 
3112   static const CostKindTblEntry SLMCostTbl[] = {
3113     // slm pcmpeq/pcmpgt throughput is 2
3114     { ISD::SETCC,   MVT::v2i64,   { 2, 5, 1, 2 } },
3115     // slm pblendvb/blendvpd/blendvps throughput is 4
3116     { ISD::SELECT,  MVT::v2f64,   { 4, 4, 1, 3 } }, // vblendvpd
3117     { ISD::SELECT,  MVT::v4f32,   { 4, 4, 1, 3 } }, // vblendvps
3118     { ISD::SELECT,  MVT::v2i64,   { 4, 4, 1, 3 } }, // pblendvb
3119     { ISD::SELECT,  MVT::v8i32,   { 4, 4, 1, 3 } }, // pblendvb
3120     { ISD::SELECT,  MVT::v8i16,   { 4, 4, 1, 3 } }, // pblendvb
3121     { ISD::SELECT,  MVT::v16i8,   { 4, 4, 1, 3 } }, // pblendvb
3122   };
3123 
3124   static const CostKindTblEntry AVX512BWCostTbl[] = {
3125     { ISD::SETCC,   MVT::v32i16,  { 1, 1, 1, 1 } },
3126     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 1 } },
3127     { ISD::SETCC,   MVT::v64i8,   { 1, 1, 1, 1 } },
3128     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 1 } },
3129 
3130     { ISD::SELECT,  MVT::v32i16,  { 1, 1, 1, 1 } },
3131     { ISD::SELECT,  MVT::v64i8,   { 1, 1, 1, 1 } },
3132   };
3133 
3134   static const CostKindTblEntry AVX512CostTbl[] = {
3135     { ISD::SETCC,   MVT::v8f64,   { 1, 4, 1, 1 } },
3136     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 1 } },
3137     { ISD::SETCC,   MVT::v16f32,  { 1, 4, 1, 1 } },
3138     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 1 } },
3139 
3140     { ISD::SETCC,   MVT::v8i64,   { 1, 1, 1, 1 } },
3141     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 1 } },
3142     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3143     { ISD::SETCC,   MVT::v16i32,  { 1, 1, 1, 1 } },
3144     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 1 } },
3145     { ISD::SETCC,   MVT::v32i16,  { 3, 7, 5, 5 } },
3146     { ISD::SETCC,   MVT::v64i8,   { 3, 7, 5, 5 } },
3147 
3148     { ISD::SELECT,  MVT::v8i64,   { 1, 1, 1, 1 } },
3149     { ISD::SELECT,  MVT::v4i64,   { 1, 1, 1, 1 } },
3150     { ISD::SELECT,  MVT::v2i64,   { 1, 1, 1, 1 } },
3151     { ISD::SELECT,  MVT::v16i32,  { 1, 1, 1, 1 } },
3152     { ISD::SELECT,  MVT::v8i32,   { 1, 1, 1, 1 } },
3153     { ISD::SELECT,  MVT::v4i32,   { 1, 1, 1, 1 } },
3154     { ISD::SELECT,  MVT::v8f64,   { 1, 1, 1, 1 } },
3155     { ISD::SELECT,  MVT::v4f64,   { 1, 1, 1, 1 } },
3156     { ISD::SELECT,  MVT::v2f64,   { 1, 1, 1, 1 } },
3157     { ISD::SELECT,  MVT::f64,     { 1, 1, 1, 1 } },
3158     { ISD::SELECT,  MVT::v16f32,  { 1, 1, 1, 1 } },
3159     { ISD::SELECT,  MVT::v8f32 ,  { 1, 1, 1, 1 } },
3160     { ISD::SELECT,  MVT::v4f32,   { 1, 1, 1, 1 } },
3161     { ISD::SELECT,  MVT::f32  ,   { 1, 1, 1, 1 } },
3162 
3163     { ISD::SELECT,  MVT::v32i16,  { 2, 2, 4, 4 } },
3164     { ISD::SELECT,  MVT::v16i16,  { 1, 1, 1, 1 } },
3165     { ISD::SELECT,  MVT::v8i16,   { 1, 1, 1, 1 } },
3166     { ISD::SELECT,  MVT::v64i8,   { 2, 2, 4, 4 } },
3167     { ISD::SELECT,  MVT::v32i8,   { 1, 1, 1, 1 } },
3168     { ISD::SELECT,  MVT::v16i8,   { 1, 1, 1, 1 } },
3169   };
3170 
3171   static const CostKindTblEntry AVX2CostTbl[] = {
3172     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 2 } },
3173     { ISD::SETCC,   MVT::v2f64,   { 1, 4, 1, 1 } },
3174     { ISD::SETCC,   MVT::f64,     { 1, 4, 1, 1 } },
3175     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 2 } },
3176     { ISD::SETCC,   MVT::v4f32,   { 1, 4, 1, 1 } },
3177     { ISD::SETCC,   MVT::f32,     { 1, 4, 1, 1 } },
3178 
3179     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 2 } },
3180     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 2 } },
3181     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 2 } },
3182     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 2 } },
3183 
3184     { ISD::SELECT,  MVT::v4f64,   { 2, 2, 1, 2 } }, // vblendvpd
3185     { ISD::SELECT,  MVT::v8f32,   { 2, 2, 1, 2 } }, // vblendvps
3186     { ISD::SELECT,  MVT::v4i64,   { 2, 2, 1, 2 } }, // pblendvb
3187     { ISD::SELECT,  MVT::v8i32,   { 2, 2, 1, 2 } }, // pblendvb
3188     { ISD::SELECT,  MVT::v16i16,  { 2, 2, 1, 2 } }, // pblendvb
3189     { ISD::SELECT,  MVT::v32i8,   { 2, 2, 1, 2 } }, // pblendvb
3190   };
3191 
3192   static const CostKindTblEntry XOPCostTbl[] = {
3193     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3194     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3195   };
3196 
3197   static const CostKindTblEntry AVX1CostTbl[] = {
3198     { ISD::SETCC,   MVT::v4f64,   { 2, 3, 1, 2 } },
3199     { ISD::SETCC,   MVT::v2f64,   { 1, 3, 1, 1 } },
3200     { ISD::SETCC,   MVT::f64,     { 1, 3, 1, 1 } },
3201     { ISD::SETCC,   MVT::v8f32,   { 2, 3, 1, 2 } },
3202     { ISD::SETCC,   MVT::v4f32,   { 1, 3, 1, 1 } },
3203     { ISD::SETCC,   MVT::f32,     { 1, 3, 1, 1 } },
3204 
3205     // AVX1 does not support 8-wide integer compare.
3206     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3207     { ISD::SETCC,   MVT::v8i32,   { 4, 2, 5, 6 } },
3208     { ISD::SETCC,   MVT::v16i16,  { 4, 2, 5, 6 } },
3209     { ISD::SETCC,   MVT::v32i8,   { 4, 2, 5, 6 } },
3210 
3211     { ISD::SELECT,  MVT::v4f64,   { 3, 3, 1, 2 } }, // vblendvpd
3212     { ISD::SELECT,  MVT::v8f32,   { 3, 3, 1, 2 } }, // vblendvps
3213     { ISD::SELECT,  MVT::v4i64,   { 3, 3, 1, 2 } }, // vblendvpd
3214     { ISD::SELECT,  MVT::v8i32,   { 3, 3, 1, 2 } }, // vblendvps
3215     { ISD::SELECT,  MVT::v16i16,  { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3216     { ISD::SELECT,  MVT::v32i8,   { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3217   };
3218 
3219   static const CostKindTblEntry SSE42CostTbl[] = {
3220     { ISD::SETCC,   MVT::v2i64,   { 1, 2, 1, 2 } },
3221   };
3222 
3223   static const CostKindTblEntry SSE41CostTbl[] = {
3224     { ISD::SETCC,   MVT::v2f64,   { 1, 5, 1, 1 } },
3225     { ISD::SETCC,   MVT::v4f32,   { 1, 5, 1, 1 } },
3226 
3227     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 1, 2 } }, // blendvpd
3228     { ISD::SELECT,  MVT::f64,     { 2, 2, 1, 2 } }, // blendvpd
3229     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 1, 2 } }, // blendvps
3230     { ISD::SELECT,  MVT::f32  ,   { 2, 2, 1, 2 } }, // blendvps
3231     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 1, 2 } }, // pblendvb
3232     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 1, 2 } }, // pblendvb
3233     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 1, 2 } }, // pblendvb
3234     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 1, 2 } }, // pblendvb
3235   };
3236 
3237   static const CostKindTblEntry SSE2CostTbl[] = {
3238     { ISD::SETCC,   MVT::v2f64,   { 2, 5, 1, 1 } },
3239     { ISD::SETCC,   MVT::f64,     { 1, 5, 1, 1 } },
3240 
3241     { ISD::SETCC,   MVT::v2i64,   { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3242     { ISD::SETCC,   MVT::v4i32,   { 1, 1, 1, 1 } },
3243     { ISD::SETCC,   MVT::v8i16,   { 1, 1, 1, 1 } },
3244     { ISD::SETCC,   MVT::v16i8,   { 1, 1, 1, 1 } },
3245 
3246     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3247     { ISD::SELECT,  MVT::f64,     { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3248     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 3, 3 } }, // pand + pandn + por
3249     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 3, 3 } }, // pand + pandn + por
3250     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 3, 3 } }, // pand + pandn + por
3251     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 3, 3 } }, // pand + pandn + por
3252   };
3253 
3254   static const CostKindTblEntry SSE1CostTbl[] = {
3255     { ISD::SETCC,   MVT::v4f32,   { 2, 5, 1, 1 } },
3256     { ISD::SETCC,   MVT::f32,     { 1, 5, 1, 1 } },
3257 
3258     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 3, 3 } }, // andps + andnps + orps
3259     { ISD::SELECT,  MVT::f32,     { 2, 2, 3, 3 } }, // andps + andnps + orps
3260   };
3261 
3262   if (ST->useSLMArithCosts())
3263     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3264       if (auto KindCost = Entry->Cost[CostKind])
3265         return LT.first * (ExtraCost + *KindCost);
3266 
3267   if (ST->hasBWI())
3268     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3269       if (auto KindCost = Entry->Cost[CostKind])
3270         return LT.first * (ExtraCost + *KindCost);
3271 
3272   if (ST->hasAVX512())
3273     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3274       if (auto KindCost = Entry->Cost[CostKind])
3275         return LT.first * (ExtraCost + *KindCost);
3276 
3277   if (ST->hasAVX2())
3278     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3279       if (auto KindCost = Entry->Cost[CostKind])
3280         return LT.first * (ExtraCost + *KindCost);
3281 
3282   if (ST->hasXOP())
3283     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3284       if (auto KindCost = Entry->Cost[CostKind])
3285         return LT.first * (ExtraCost + *KindCost);
3286 
3287   if (ST->hasAVX())
3288     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3289       if (auto KindCost = Entry->Cost[CostKind])
3290         return LT.first * (ExtraCost + *KindCost);
3291 
3292   if (ST->hasSSE42())
3293     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3294       if (auto KindCost = Entry->Cost[CostKind])
3295         return LT.first * (ExtraCost + *KindCost);
3296 
3297   if (ST->hasSSE41())
3298     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3299       if (auto KindCost = Entry->Cost[CostKind])
3300         return LT.first * (ExtraCost + *KindCost);
3301 
3302   if (ST->hasSSE2())
3303     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3304       if (auto KindCost = Entry->Cost[CostKind])
3305         return LT.first * (ExtraCost + *KindCost);
3306 
3307   if (ST->hasSSE1())
3308     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3309       if (auto KindCost = Entry->Cost[CostKind])
3310         return LT.first * (ExtraCost + *KindCost);
3311 
3312   // Assume a 3cy latency for fp select ops.
3313   if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3314     if (ValTy->getScalarType()->isFloatingPointTy())
3315       return 3;
3316 
3317   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3318 }
3319 
3320 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3321 
3322 InstructionCost
3323 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3324                                   TTI::TargetCostKind CostKind) {
3325   // Costs should match the codegen from:
3326   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3327   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3328   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3329   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3330   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3331 
3332   // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3333   //       specialized in these tables yet.
3334   static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3335     { ISD::FSHL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3336     { ISD::FSHL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3337     { ISD::FSHL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3338     { ISD::FSHL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3339     { ISD::FSHL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3340     { ISD::FSHL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3341     { ISD::FSHL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3342     { ISD::FSHL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3343     { ISD::FSHL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3344     { ISD::ROTL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3345     { ISD::ROTL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3346     { ISD::ROTL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3347     { ISD::ROTR,       MVT::v32i16,  {  1,  1,  1,  1 } },
3348     { ISD::ROTR,       MVT::v16i16,  {  1,  1,  1,  1 } },
3349     { ISD::ROTR,       MVT::v8i16,   {  1,  1,  1,  1 } },
3350   };
3351   static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3352     { ISD::CTPOP,      MVT::v32i16,  {  1,  1,  1,  1 } },
3353     { ISD::CTPOP,      MVT::v64i8,   {  1,  1,  1,  1 } },
3354     { ISD::CTPOP,      MVT::v16i16,  {  1,  1,  1,  1 } },
3355     { ISD::CTPOP,      MVT::v32i8,   {  1,  1,  1,  1 } },
3356     { ISD::CTPOP,      MVT::v8i16,   {  1,  1,  1,  1 } },
3357     { ISD::CTPOP,      MVT::v16i8,   {  1,  1,  1,  1 } },
3358   };
3359   static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3360     { ISD::CTPOP,      MVT::v8i64,   {  1,  1,  1,  1 } },
3361     { ISD::CTPOP,      MVT::v16i32,  {  1,  1,  1,  1 } },
3362     { ISD::CTPOP,      MVT::v4i64,   {  1,  1,  1,  1 } },
3363     { ISD::CTPOP,      MVT::v8i32,   {  1,  1,  1,  1 } },
3364     { ISD::CTPOP,      MVT::v2i64,   {  1,  1,  1,  1 } },
3365     { ISD::CTPOP,      MVT::v4i32,   {  1,  1,  1,  1 } },
3366   };
3367   static const CostKindTblEntry AVX512CDCostTbl[] = {
3368     { ISD::CTLZ,       MVT::v8i64,   {  1,  5,  1,  1 } },
3369     { ISD::CTLZ,       MVT::v16i32,  {  1,  5,  1,  1 } },
3370     { ISD::CTLZ,       MVT::v32i16,  { 18, 27, 23, 27 } },
3371     { ISD::CTLZ,       MVT::v64i8,   {  3, 16,  9, 11 } },
3372     { ISD::CTLZ,       MVT::v4i64,   {  1,  5,  1,  1 } },
3373     { ISD::CTLZ,       MVT::v8i32,   {  1,  5,  1,  1 } },
3374     { ISD::CTLZ,       MVT::v16i16,  {  8, 19, 11, 13 } },
3375     { ISD::CTLZ,       MVT::v32i8,   {  2, 11,  9, 10 } },
3376     { ISD::CTLZ,       MVT::v2i64,   {  1,  5,  1,  1 } },
3377     { ISD::CTLZ,       MVT::v4i32,   {  1,  5,  1,  1 } },
3378     { ISD::CTLZ,       MVT::v8i16,   {  3, 15,  4,  6 } },
3379     { ISD::CTLZ,       MVT::v16i8,   {  2, 10,  9, 10 } },
3380 
3381     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3382     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3383     { ISD::CTTZ,       MVT::v4i64,   {  1,  8,  6,  6 } },
3384     { ISD::CTTZ,       MVT::v8i32,   {  1,  8,  6,  6 } },
3385     { ISD::CTTZ,       MVT::v2i64,   {  1,  8,  6,  6 } },
3386     { ISD::CTTZ,       MVT::v4i32,   {  1,  8,  6,  6 } },
3387   };
3388   static const CostKindTblEntry AVX512BWCostTbl[] = {
3389     { ISD::ABS,        MVT::v32i16,  {  1,  1,  1,  1 } },
3390     { ISD::ABS,        MVT::v64i8,   {  1,  1,  1,  1 } },
3391     { ISD::BITREVERSE, MVT::v8i64,   {  3 } },
3392     { ISD::BITREVERSE, MVT::v16i32,  {  3 } },
3393     { ISD::BITREVERSE, MVT::v32i16,  {  3 } },
3394     { ISD::BITREVERSE, MVT::v64i8,   {  2 } },
3395     { ISD::BSWAP,      MVT::v8i64,   {  1 } },
3396     { ISD::BSWAP,      MVT::v16i32,  {  1 } },
3397     { ISD::BSWAP,      MVT::v32i16,  {  1 } },
3398     { ISD::CTLZ,       MVT::v8i64,   {  8, 22, 23, 23 } },
3399     { ISD::CTLZ,       MVT::v16i32,  {  8, 23, 25, 25 } },
3400     { ISD::CTLZ,       MVT::v32i16,  {  4, 15, 15, 16 } },
3401     { ISD::CTLZ,       MVT::v64i8,   {  3, 12, 10,  9 } },
3402     { ISD::CTPOP,      MVT::v2i64,   {  3,  7, 10, 10 } },
3403     { ISD::CTPOP,      MVT::v4i64,   {  3,  7, 10, 10 } },
3404     { ISD::CTPOP,      MVT::v8i64,   {  3,  8, 10, 12 } },
3405     { ISD::CTPOP,      MVT::v4i32,   {  7, 11, 14, 14 } },
3406     { ISD::CTPOP,      MVT::v8i32,   {  7, 11, 14, 14 } },
3407     { ISD::CTPOP,      MVT::v16i32,  {  7, 12, 14, 16 } },
3408     { ISD::CTPOP,      MVT::v8i16,   {  2,  7, 11, 11 } },
3409     { ISD::CTPOP,      MVT::v16i16,  {  2,  7, 11, 11 } },
3410     { ISD::CTPOP,      MVT::v32i16,  {  3,  7, 11, 13 } },
3411     { ISD::CTPOP,      MVT::v16i8,   {  2,  4,  8,  8 } },
3412     { ISD::CTPOP,      MVT::v32i8,   {  2,  4,  8,  8 } },
3413     { ISD::CTPOP,      MVT::v64i8,   {  2,  5,  8, 10 } },
3414     { ISD::CTTZ,       MVT::v8i16,   {  3,  9, 14, 14 } },
3415     { ISD::CTTZ,       MVT::v16i16,  {  3,  9, 14, 14 } },
3416     { ISD::CTTZ,       MVT::v32i16,  {  3, 10, 14, 16 } },
3417     { ISD::CTTZ,       MVT::v16i8,   {  2,  6, 11, 11 } },
3418     { ISD::CTTZ,       MVT::v32i8,   {  2,  6, 11, 11 } },
3419     { ISD::CTTZ,       MVT::v64i8,   {  3,  7, 11, 13 } },
3420     { ISD::ROTL,       MVT::v32i16,  {  2,  8,  6,  8 } },
3421     { ISD::ROTL,       MVT::v16i16,  {  2,  8,  6,  7 } },
3422     { ISD::ROTL,       MVT::v8i16,   {  2,  7,  6,  7 } },
3423     { ISD::ROTL,       MVT::v64i8,   {  5,  6, 11, 12 } },
3424     { ISD::ROTL,       MVT::v32i8,   {  5, 15,  7, 10 } },
3425     { ISD::ROTL,       MVT::v16i8,   {  5, 15,  7, 10 } },
3426     { ISD::ROTR,       MVT::v32i16,  {  2,  8,  6,  8 } },
3427     { ISD::ROTR,       MVT::v16i16,  {  2,  8,  6,  7 } },
3428     { ISD::ROTR,       MVT::v8i16,   {  2,  7,  6,  7 } },
3429     { ISD::ROTR,       MVT::v64i8,   {  5,  6, 12, 14 } },
3430     { ISD::ROTR,       MVT::v32i8,   {  5, 14,  6,  9 } },
3431     { ISD::ROTR,       MVT::v16i8,   {  5, 14,  6,  9 } },
3432     { ISD::SADDSAT,    MVT::v32i16,  {  1 } },
3433     { ISD::SADDSAT,    MVT::v64i8,   {  1 } },
3434     { ISD::SMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3435     { ISD::SMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3436     { ISD::SMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3437     { ISD::SMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3438     { ISD::SSUBSAT,    MVT::v32i16,  {  1 } },
3439     { ISD::SSUBSAT,    MVT::v64i8,   {  1 } },
3440     { ISD::UADDSAT,    MVT::v32i16,  {  1 } },
3441     { ISD::UADDSAT,    MVT::v64i8,   {  1 } },
3442     { ISD::UMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3443     { ISD::UMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3444     { ISD::UMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3445     { ISD::UMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3446     { ISD::USUBSAT,    MVT::v32i16,  {  1 } },
3447     { ISD::USUBSAT,    MVT::v64i8,   {  1 } },
3448   };
3449   static const CostKindTblEntry AVX512CostTbl[] = {
3450     { ISD::ABS,        MVT::v8i64,   {  1,  1,  1,  1 } },
3451     { ISD::ABS,        MVT::v4i64,   {  1,  1,  1,  1 } },
3452     { ISD::ABS,        MVT::v2i64,   {  1,  1,  1,  1 } },
3453     { ISD::ABS,        MVT::v16i32,  {  1,  1,  1,  1 } },
3454     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  1 } },
3455     { ISD::ABS,        MVT::v32i16,  {  2,  7,  4,  4 } },
3456     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  1 } },
3457     { ISD::ABS,        MVT::v64i8,   {  2,  7,  4,  4 } },
3458     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  1 } },
3459     { ISD::BITREVERSE, MVT::v8i64,   { 36 } },
3460     { ISD::BITREVERSE, MVT::v16i32,  { 24 } },
3461     { ISD::BITREVERSE, MVT::v32i16,  { 10 } },
3462     { ISD::BITREVERSE, MVT::v64i8,   { 10 } },
3463     { ISD::BSWAP,      MVT::v8i64,   {  4 } },
3464     { ISD::BSWAP,      MVT::v16i32,  {  4 } },
3465     { ISD::BSWAP,      MVT::v32i16,  {  4 } },
3466     { ISD::CTLZ,       MVT::v8i64,   { 10, 28, 32, 32 } },
3467     { ISD::CTLZ,       MVT::v16i32,  { 12, 30, 38, 38 } },
3468     { ISD::CTLZ,       MVT::v32i16,  {  8, 15, 29, 29 } },
3469     { ISD::CTLZ,       MVT::v64i8,   {  6, 11, 19, 19 } },
3470     { ISD::CTPOP,      MVT::v8i64,   { 16, 16, 19, 19 } },
3471     { ISD::CTPOP,      MVT::v16i32,  { 24, 19, 27, 27 } },
3472     { ISD::CTPOP,      MVT::v32i16,  { 18, 15, 22, 22 } },
3473     { ISD::CTPOP,      MVT::v64i8,   { 12, 11, 16, 16 } },
3474     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3475     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3476     { ISD::CTTZ,       MVT::v32i16,  {  7, 17, 27, 27 } },
3477     { ISD::CTTZ,       MVT::v64i8,   {  6, 13, 21, 21 } },
3478     { ISD::ROTL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3479     { ISD::ROTL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3480     { ISD::ROTL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3481     { ISD::ROTL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3482     { ISD::ROTL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3483     { ISD::ROTL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3484     { ISD::ROTR,       MVT::v8i64,   {  1,  1,  1,  1 } },
3485     { ISD::ROTR,       MVT::v4i64,   {  1,  1,  1,  1 } },
3486     { ISD::ROTR,       MVT::v2i64,   {  1,  1,  1,  1 } },
3487     { ISD::ROTR,       MVT::v16i32,  {  1,  1,  1,  1 } },
3488     { ISD::ROTR,       MVT::v8i32,   {  1,  1,  1,  1 } },
3489     { ISD::ROTR,       MVT::v4i32,   {  1,  1,  1,  1 } },
3490     { ISD::SMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3491     { ISD::SMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3492     { ISD::SMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3493     { ISD::SMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3494     { ISD::SMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3495     { ISD::SMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3496     { ISD::SMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3497     { ISD::SMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3498     { ISD::SMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3499     { ISD::SMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3500     { ISD::SMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3501     { ISD::SMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3502     { ISD::UMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3503     { ISD::UMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3504     { ISD::UMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3505     { ISD::UMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3506     { ISD::UMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3507     { ISD::UMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3508     { ISD::UMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3509     { ISD::UMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3510     { ISD::UMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3511     { ISD::UMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3512     { ISD::UMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3513     { ISD::UMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3514     { ISD::USUBSAT,    MVT::v16i32,  {  2 } }, // pmaxud + psubd
3515     { ISD::USUBSAT,    MVT::v2i64,   {  2 } }, // pmaxuq + psubq
3516     { ISD::USUBSAT,    MVT::v4i64,   {  2 } }, // pmaxuq + psubq
3517     { ISD::USUBSAT,    MVT::v8i64,   {  2 } }, // pmaxuq + psubq
3518     { ISD::UADDSAT,    MVT::v16i32,  {  3 } }, // not + pminud + paddd
3519     { ISD::UADDSAT,    MVT::v2i64,   {  3 } }, // not + pminuq + paddq
3520     { ISD::UADDSAT,    MVT::v4i64,   {  3 } }, // not + pminuq + paddq
3521     { ISD::UADDSAT,    MVT::v8i64,   {  3 } }, // not + pminuq + paddq
3522     { ISD::SADDSAT,    MVT::v32i16,  {  2 } },
3523     { ISD::SADDSAT,    MVT::v64i8,   {  2 } },
3524     { ISD::SSUBSAT,    MVT::v32i16,  {  2 } },
3525     { ISD::SSUBSAT,    MVT::v64i8,   {  2 } },
3526     { ISD::UADDSAT,    MVT::v32i16,  {  2 } },
3527     { ISD::UADDSAT,    MVT::v64i8,   {  2 } },
3528     { ISD::USUBSAT,    MVT::v32i16,  {  2 } },
3529     { ISD::USUBSAT,    MVT::v64i8,   {  2 } },
3530     { ISD::FMAXNUM,    MVT::f32,     {  2 } },
3531     { ISD::FMAXNUM,    MVT::v4f32,   {  2 } },
3532     { ISD::FMAXNUM,    MVT::v8f32,   {  2 } },
3533     { ISD::FMAXNUM,    MVT::v16f32,  {  2 } },
3534     { ISD::FMAXNUM,    MVT::f64,     {  2 } },
3535     { ISD::FMAXNUM,    MVT::v2f64,   {  2 } },
3536     { ISD::FMAXNUM,    MVT::v4f64,   {  2 } },
3537     { ISD::FMAXNUM,    MVT::v8f64,   {  2 } },
3538     { ISD::FSQRT,      MVT::f32,     {  3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3539     { ISD::FSQRT,      MVT::v4f32,   {  3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3540     { ISD::FSQRT,      MVT::v8f32,   {  6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3541     { ISD::FSQRT,      MVT::v16f32,  { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3542     { ISD::FSQRT,      MVT::f64,     {  6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3543     { ISD::FSQRT,      MVT::v2f64,   {  6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3544     { ISD::FSQRT,      MVT::v4f64,   { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3545     { ISD::FSQRT,      MVT::v8f64,   { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3546   };
3547   static const CostKindTblEntry XOPCostTbl[] = { // looked up only when ST->hasXOP()
3548     { ISD::BITREVERSE, MVT::v4i64,   {  4 } },
3549     { ISD::BITREVERSE, MVT::v8i32,   {  4 } },
3550     { ISD::BITREVERSE, MVT::v16i16,  {  4 } },
3551     { ISD::BITREVERSE, MVT::v32i8,   {  4 } },
3552     { ISD::BITREVERSE, MVT::v2i64,   {  1 } },
3553     { ISD::BITREVERSE, MVT::v4i32,   {  1 } },
3554     { ISD::BITREVERSE, MVT::v8i16,   {  1 } },
3555     { ISD::BITREVERSE, MVT::v16i8,   {  1 } },
3556     { ISD::BITREVERSE, MVT::i64,     {  3 } },
3557     { ISD::BITREVERSE, MVT::i32,     {  3 } },
3558     { ISD::BITREVERSE, MVT::i16,     {  3 } },
3559     { ISD::BITREVERSE, MVT::i8,      {  3 } },
3560     // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3561     { ISD::ROTL,       MVT::v4i64,   {  4,  7,  5,  6 } },
3562     { ISD::ROTL,       MVT::v8i32,   {  4,  7,  5,  6 } },
3563     { ISD::ROTL,       MVT::v16i16,  {  4,  7,  5,  6 } },
3564     { ISD::ROTL,       MVT::v32i8,   {  4,  7,  5,  6 } },
3565     { ISD::ROTL,       MVT::v2i64,   {  1,  3,  1,  1 } },
3566     { ISD::ROTL,       MVT::v4i32,   {  1,  3,  1,  1 } },
3567     { ISD::ROTL,       MVT::v8i16,   {  1,  3,  1,  1 } },
3568     { ISD::ROTL,       MVT::v16i8,   {  1,  3,  1,  1 } },
3569     { ISD::ROTR,       MVT::v4i64,   {  4,  7,  8,  9 } },
3570     { ISD::ROTR,       MVT::v8i32,   {  4,  7,  8,  9 } },
3571     { ISD::ROTR,       MVT::v16i16,  {  4,  7,  8,  9 } },
3572     { ISD::ROTR,       MVT::v32i8,   {  4,  7,  8,  9 } },
3573     { ISD::ROTR,       MVT::v2i64,   {  1,  3,  3,  3 } },
3574     { ISD::ROTR,       MVT::v4i32,   {  1,  3,  3,  3 } },
3575     { ISD::ROTR,       MVT::v8i16,   {  1,  3,  3,  3 } },
3576     { ISD::ROTR,       MVT::v16i8,   {  1,  3,  3,  3 } }
3577   };
3578   static const CostKindTblEntry AVX2CostTbl[] = { // looked up only when ST->hasAVX2()
3579     { ISD::ABS,        MVT::v2i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3580     { ISD::ABS,        MVT::v4i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3581     { ISD::ABS,        MVT::v4i32,   {  1,  1,  1,  1 } },
3582     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  2 } },
3583     { ISD::ABS,        MVT::v8i16,   {  1,  1,  1,  1 } },
3584     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  2 } },
3585     { ISD::ABS,        MVT::v16i8,   {  1,  1,  1,  1 } },
3586     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  2 } },
3587     { ISD::BITREVERSE, MVT::v2i64,   {  3 } },
3588     { ISD::BITREVERSE, MVT::v4i64,   {  3 } },
3589     { ISD::BITREVERSE, MVT::v4i32,   {  3 } },
3590     { ISD::BITREVERSE, MVT::v8i32,   {  3 } },
3591     { ISD::BITREVERSE, MVT::v8i16,   {  3 } },
3592     { ISD::BITREVERSE, MVT::v16i16,  {  3 } },
3593     { ISD::BITREVERSE, MVT::v16i8,   {  3 } },
3594     { ISD::BITREVERSE, MVT::v32i8,   {  3 } },
3595     { ISD::BSWAP,      MVT::v4i64,   {  1 } },
3596     { ISD::BSWAP,      MVT::v8i32,   {  1 } },
3597     { ISD::BSWAP,      MVT::v16i16,  {  1 } },
3598     { ISD::CTLZ,       MVT::v2i64,   {  7, 18, 24, 25 } },
3599     { ISD::CTLZ,       MVT::v4i64,   { 14, 18, 24, 44 } },
3600     { ISD::CTLZ,       MVT::v4i32,   {  5, 16, 19, 20 } },
3601     { ISD::CTLZ,       MVT::v8i32,   { 10, 16, 19, 34 } },
3602     { ISD::CTLZ,       MVT::v8i16,   {  4, 13, 14, 15 } },
3603     { ISD::CTLZ,       MVT::v16i16,  {  6, 14, 14, 24 } },
3604     { ISD::CTLZ,       MVT::v16i8,   {  3, 12,  9, 10 } },
3605     { ISD::CTLZ,       MVT::v32i8,   {  4, 12,  9, 14 } },
3606     { ISD::CTPOP,      MVT::v2i64,   {  3,  9, 10, 10 } },
3607     { ISD::CTPOP,      MVT::v4i64,   {  4,  9, 10, 14 } },
3608     { ISD::CTPOP,      MVT::v4i32,   {  7, 12, 14, 14 } },
3609     { ISD::CTPOP,      MVT::v8i32,   {  7, 12, 14, 18 } },
3610     { ISD::CTPOP,      MVT::v8i16,   {  3,  7, 11, 11 } },
3611     { ISD::CTPOP,      MVT::v16i16,  {  6,  8, 11, 18 } },
3612     { ISD::CTPOP,      MVT::v16i8,   {  2,  5,  8,  8 } },
3613     { ISD::CTPOP,      MVT::v32i8,   {  3,  5,  8, 12 } },
3614     { ISD::CTTZ,       MVT::v2i64,   {  4, 11, 13, 13 } },
3615     { ISD::CTTZ,       MVT::v4i64,   {  5, 11, 13, 20 } },
3616     { ISD::CTTZ,       MVT::v4i32,   {  7, 14, 17, 17 } },
3617     { ISD::CTTZ,       MVT::v8i32,   {  7, 15, 17, 24 } },
3618     { ISD::CTTZ,       MVT::v8i16,   {  4,  9, 14, 14 } },
3619     { ISD::CTTZ,       MVT::v16i16,  {  6,  9, 14, 24 } },
3620     { ISD::CTTZ,       MVT::v16i8,   {  3,  7, 11, 11 } },
3621     { ISD::CTTZ,       MVT::v32i8,   {  5,  7, 11, 18 } },
3622     { ISD::SADDSAT,    MVT::v16i16,  {  1 } },
3623     { ISD::SADDSAT,    MVT::v32i8,   {  1 } },
3624     { ISD::SMAX,       MVT::v2i64,   {  2,  7,  2,  3 } },
3625     { ISD::SMAX,       MVT::v4i64,   {  2,  7,  2,  3 } },
3626     { ISD::SMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
3627     { ISD::SMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
3628     { ISD::SMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
3629     { ISD::SMIN,       MVT::v2i64,   {  2,  7,  2,  3 } },
3630     { ISD::SMIN,       MVT::v4i64,   {  2,  7,  2,  3 } },
3631     { ISD::SMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
3632     { ISD::SMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
3633     { ISD::SMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
3634     { ISD::SSUBSAT,    MVT::v16i16,  {  1 } },
3635     { ISD::SSUBSAT,    MVT::v32i8,   {  1 } },
3636     { ISD::UADDSAT,    MVT::v16i16,  {  1 } },
3637     { ISD::UADDSAT,    MVT::v32i8,   {  1 } },
3638     { ISD::UADDSAT,    MVT::v8i32,   {  3 } }, // not + pminud + paddd
3639     { ISD::UMAX,       MVT::v2i64,   {  2,  8,  5,  6 } },
3640     { ISD::UMAX,       MVT::v4i64,   {  2,  8,  5,  8 } },
3641     { ISD::UMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
3642     { ISD::UMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
3643     { ISD::UMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
3644     { ISD::UMIN,       MVT::v2i64,   {  2,  8,  5,  6 } },
3645     { ISD::UMIN,       MVT::v4i64,   {  2,  8,  5,  8 } },
3646     { ISD::UMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
3647     { ISD::UMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
3648     { ISD::UMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
3649     { ISD::USUBSAT,    MVT::v16i16,  {  1 } },
3650     { ISD::USUBSAT,    MVT::v32i8,   {  1 } },
3651     { ISD::USUBSAT,    MVT::v8i32,   {  2 } }, // pmaxud + psubd
3652     { ISD::FMAXNUM,    MVT::v8f32,   {  3 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3653     { ISD::FMAXNUM,    MVT::v4f64,   {  3 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3654     { ISD::FSQRT,      MVT::f32,     {  7, 15, 1, 1 } }, // vsqrtss
3655     { ISD::FSQRT,      MVT::v4f32,   {  7, 15, 1, 1 } }, // vsqrtps
3656     { ISD::FSQRT,      MVT::v8f32,   { 14, 21, 1, 3 } }, // vsqrtps
3657     { ISD::FSQRT,      MVT::f64,     { 14, 21, 1, 1 } }, // vsqrtsd
3658     { ISD::FSQRT,      MVT::v2f64,   { 14, 21, 1, 1 } }, // vsqrtpd
3659     { ISD::FSQRT,      MVT::v4f64,   { 28, 35, 1, 3 } }, // vsqrtpd
3660   };
3661   static const CostKindTblEntry AVX1CostTbl[] = {
3662     { ISD::ABS,        MVT::v4i64,   {  6,  8,  6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3663     { ISD::ABS,        MVT::v8i32,   {  3,  6,  4,  5 } },
3664     { ISD::ABS,        MVT::v16i16,  {  3,  6,  4,  5 } },
3665     { ISD::ABS,        MVT::v32i8,   {  3,  6,  4,  5 } },
3666     { ISD::BITREVERSE, MVT::v4i64,   { 12 } }, // 2 x 128-bit Op + extract/insert
3667     { ISD::BITREVERSE, MVT::v8i32,   { 12 } }, // 2 x 128-bit Op + extract/insert
3668     { ISD::BITREVERSE, MVT::v16i16,  { 12 } }, // 2 x 128-bit Op + extract/insert
3669     { ISD::BITREVERSE, MVT::v32i8,   { 12 } }, // 2 x 128-bit Op + extract/insert
3670     { ISD::BSWAP,      MVT::v4i64,   {  4 } },
3671     { ISD::BSWAP,      MVT::v8i32,   {  4 } },
3672     { ISD::BSWAP,      MVT::v16i16,  {  4 } },
3673     { ISD::CTLZ,       MVT::v4i64,   { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3674     { ISD::CTLZ,       MVT::v2i64,   { 14, 24, 24, 28 } },
3675     { ISD::CTLZ,       MVT::v8i32,   { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3676     { ISD::CTLZ,       MVT::v4i32,   { 12, 20, 19, 23 } },
3677     { ISD::CTLZ,       MVT::v16i16,  { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3678     { ISD::CTLZ,       MVT::v8i16,   {  9, 16, 14, 18 } },
3679     { ISD::CTLZ,       MVT::v32i8,   { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3680     { ISD::CTLZ,       MVT::v16i8,   {  7, 12,  9, 13 } },
3681     { ISD::CTPOP,      MVT::v4i64,   { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3682     { ISD::CTPOP,      MVT::v2i64,   {  7, 14, 10, 14 } },
3683     { ISD::CTPOP,      MVT::v8i32,   { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3684     { ISD::CTPOP,      MVT::v4i32,   {  9, 20, 14, 18 } },
3685     { ISD::CTPOP,      MVT::v16i16,  { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3686     { ISD::CTPOP,      MVT::v8i16,   {  8, 18, 11, 15 } },
3687     { ISD::CTPOP,      MVT::v32i8,   { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3688     { ISD::CTPOP,      MVT::v16i8,   {  6, 12,  8, 12 } },
3689     { ISD::CTTZ,       MVT::v4i64,   { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3690     { ISD::CTTZ,       MVT::v2i64,   {  9, 19, 13, 17 } },
3691     { ISD::CTTZ,       MVT::v8i32,   { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3692     { ISD::CTTZ,       MVT::v4i32,   { 11, 24, 17, 21 } },
3693     { ISD::CTTZ,       MVT::v16i16,  { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3694     { ISD::CTTZ,       MVT::v8i16,   {  9, 21, 14, 18 } },
3695     { ISD::CTTZ,       MVT::v32i8,   { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3696     { ISD::CTTZ,       MVT::v16i8,   {  8, 16, 11, 15 } },
3697     { ISD::SADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3698     { ISD::SADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3699     { ISD::SMAX,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
3700     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  4 } },
3701     { ISD::SMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3702     { ISD::SMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3703     { ISD::SMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3704     { ISD::SMIN,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
3705     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
3706     { ISD::SMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3707     { ISD::SMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3708     { ISD::SMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3709     { ISD::SSUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3710     { ISD::SSUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3711     { ISD::UADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3712     { ISD::UADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3713     { ISD::UADDSAT,    MVT::v8i32,   {  8 } }, // 2 x 128-bit Op + extract/insert
3714     { ISD::UMAX,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3715     { ISD::UMAX,       MVT::v2i64,   {  4,  8,  5,  7 } },
3716     { ISD::UMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3717     { ISD::UMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3718     { ISD::UMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3719     { ISD::UMIN,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3720     { ISD::UMIN,       MVT::v2i64,   {  4,  8,  5,  7 } },
3721     { ISD::UMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3722     { ISD::UMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3723     { ISD::UMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3724     { ISD::USUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3725     { ISD::USUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3726     { ISD::USUBSAT,    MVT::v8i32,   {  6 } }, // 2 x 128-bit Op + extract/insert
3727     { ISD::FMAXNUM,    MVT::f32,     {  3 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3728     { ISD::FMAXNUM,    MVT::v4f32,   {  3 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3729     { ISD::FMAXNUM,    MVT::v8f32,   {  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
3730     { ISD::FMAXNUM,    MVT::f64,     {  3 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3731     { ISD::FMAXNUM,    MVT::v2f64,   {  3 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3732     { ISD::FMAXNUM,    MVT::v4f64,   {  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
3733     { ISD::FSQRT,      MVT::f32,     { 21, 21, 1, 1 } }, // vsqrtss
3734     { ISD::FSQRT,      MVT::v4f32,   { 21, 21, 1, 1 } }, // vsqrtps
3735     { ISD::FSQRT,      MVT::v8f32,   { 42, 42, 1, 3 } }, // vsqrtps
3736     { ISD::FSQRT,      MVT::f64,     { 27, 27, 1, 1 } }, // vsqrtsd
3737     { ISD::FSQRT,      MVT::v2f64,   { 27, 27, 1, 1 } }, // vsqrtpd
3738     { ISD::FSQRT,      MVT::v4f64,   { 54, 54, 1, 3 } }, // vsqrtpd
3739   };
3740   static const CostKindTblEntry GLMCostTbl[] = { // looked up when ST->useGLMDivSqrtCosts()
3741     { ISD::FSQRT,      MVT::f32,     { 19, 20, 1, 1 } }, // sqrtss
3742     { ISD::FSQRT,      MVT::v4f32,   { 37, 41, 1, 5 } }, // sqrtps
3743     { ISD::FSQRT,      MVT::f64,     { 34, 35, 1, 1 } }, // sqrtsd
3744     { ISD::FSQRT,      MVT::v2f64,   { 67, 71, 1, 5 } }, // sqrtpd
3745   };
3746   static const CostKindTblEntry SLMCostTbl[] = { // looked up when ST->useSLMArithCosts()
3747     { ISD::FSQRT,      MVT::f32,     { 20, 20, 1, 1 } }, // sqrtss
3748     { ISD::FSQRT,      MVT::v4f32,   { 40, 41, 1, 5 } }, // sqrtps
3749     { ISD::FSQRT,      MVT::f64,     { 35, 35, 1, 1 } }, // sqrtsd
3750     { ISD::FSQRT,      MVT::v2f64,   { 70, 71, 1, 5 } }, // sqrtpd
3751   };
3752   static const CostKindTblEntry SSE42CostTbl[] = {
3753     { ISD::USUBSAT,    MVT::v4i32,   {  2 } }, // pmaxud + psubd
3754     { ISD::UADDSAT,    MVT::v4i32,   {  3 } }, // not + pminud + paddd
3755     { ISD::FSQRT,      MVT::f32,     { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3756     { ISD::FSQRT,      MVT::v4f32,   { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3757   };
3758   static const CostKindTblEntry SSE41CostTbl[] = {
3759     { ISD::ABS,        MVT::v2i64,   {  3,  4,  3,  5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3760     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  3 } },
3761     { ISD::SMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
3762     { ISD::SMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
3763     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
3764     { ISD::SMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
3765     { ISD::SMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
3766     { ISD::UMAX,       MVT::v2i64,   {  2, 11,  6,  7 } },
3767     { ISD::UMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
3768     { ISD::UMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
3769     { ISD::UMIN,       MVT::v2i64,   {  2, 11,  6,  7 } },
3770     { ISD::UMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
3771     { ISD::UMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
3772   };
3773   static const CostKindTblEntry SSSE3CostTbl[] = {
3774     { ISD::ABS,        MVT::v4i32,   {  1,  2,  1,  1 } },
3775     { ISD::ABS,        MVT::v8i16,   {  1,  2,  1,  1 } },
3776     { ISD::ABS,        MVT::v16i8,   {  1,  2,  1,  1 } },
3777     { ISD::BITREVERSE, MVT::v2i64,   {  5 } },
3778     { ISD::BITREVERSE, MVT::v4i32,   {  5 } },
3779     { ISD::BITREVERSE, MVT::v8i16,   {  5 } },
3780     { ISD::BITREVERSE, MVT::v16i8,   {  5 } },
3781     { ISD::BSWAP,      MVT::v2i64,   {  1 } },
3782     { ISD::BSWAP,      MVT::v4i32,   {  1 } },
3783     { ISD::BSWAP,      MVT::v8i16,   {  1 } },
3784     { ISD::CTLZ,       MVT::v2i64,   { 18, 28, 28, 35 } },
3785     { ISD::CTLZ,       MVT::v4i32,   { 15, 20, 22, 28 } },
3786     { ISD::CTLZ,       MVT::v8i16,   { 13, 17, 16, 22 } },
3787     { ISD::CTLZ,       MVT::v16i8,   { 11, 15, 10, 16 } },
3788     { ISD::CTPOP,      MVT::v2i64,   { 13, 19, 12, 18 } },
3789     { ISD::CTPOP,      MVT::v4i32,   { 18, 24, 16, 22 } },
3790     { ISD::CTPOP,      MVT::v8i16,   { 13, 18, 14, 20 } },
3791     { ISD::CTPOP,      MVT::v16i8,   { 11, 12, 10, 16 } },
3792     { ISD::CTTZ,       MVT::v2i64,   { 13, 25, 15, 22 } },
3793     { ISD::CTTZ,       MVT::v4i32,   { 18, 26, 19, 25 } },
3794     { ISD::CTTZ,       MVT::v8i16,   { 13, 20, 17, 23 } },
3795     { ISD::CTTZ,       MVT::v16i8,   { 11, 16, 13, 19 } }
3796   };
3797   static const CostKindTblEntry SSE2CostTbl[] = {
3798     { ISD::ABS,        MVT::v2i64,   {  3,  6,  5,  5 } },
3799     { ISD::ABS,        MVT::v4i32,   {  1,  4,  4,  4 } },
3800     { ISD::ABS,        MVT::v8i16,   {  1,  2,  3,  3 } },
3801     { ISD::ABS,        MVT::v16i8,   {  1,  2,  3,  3 } },
3802     { ISD::BITREVERSE, MVT::v2i64,   { 29 } },
3803     { ISD::BITREVERSE, MVT::v4i32,   { 27 } },
3804     { ISD::BITREVERSE, MVT::v8i16,   { 27 } },
3805     { ISD::BITREVERSE, MVT::v16i8,   { 20 } },
3806     { ISD::BSWAP,      MVT::v2i64,   {  7 } },
3807     { ISD::BSWAP,      MVT::v4i32,   {  7 } },
3808     { ISD::BSWAP,      MVT::v8i16,   {  7 } },
3809     { ISD::CTLZ,       MVT::v2i64,   { 10, 45, 36, 38 } },
3810     { ISD::CTLZ,       MVT::v4i32,   { 10, 45, 38, 40 } },
3811     { ISD::CTLZ,       MVT::v8i16,   {  9, 38, 32, 34 } },
3812     { ISD::CTLZ,       MVT::v16i8,   {  8, 39, 29, 32 } },
3813     { ISD::CTPOP,      MVT::v2i64,   { 12, 26, 16, 18 } },
3814     { ISD::CTPOP,      MVT::v4i32,   { 15, 29, 21, 23 } },
3815     { ISD::CTPOP,      MVT::v8i16,   { 13, 25, 18, 20 } },
3816     { ISD::CTPOP,      MVT::v16i8,   { 10, 21, 14, 16 } },
3817     { ISD::CTTZ,       MVT::v2i64,   { 14, 28, 19, 21 } },
3818     { ISD::CTTZ,       MVT::v4i32,   { 18, 31, 24, 26 } },
3819     { ISD::CTTZ,       MVT::v8i16,   { 16, 27, 21, 23 } },
3820     { ISD::CTTZ,       MVT::v16i8,   { 13, 23, 17, 19 } },
3821     { ISD::SADDSAT,    MVT::v8i16,   {  1 } },
3822     { ISD::SADDSAT,    MVT::v16i8,   {  1 } },
3823     { ISD::SMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
3824     { ISD::SMAX,       MVT::v4i32,   {  2,  4,  5,  5 } },
3825     { ISD::SMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
3826     { ISD::SMAX,       MVT::v16i8,   {  2,  4,  5,  5 } },
3827     { ISD::SMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
3828     { ISD::SMIN,       MVT::v4i32,   {  2,  4,  5,  5 } },
3829     { ISD::SMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
3830     { ISD::SMIN,       MVT::v16i8,   {  2,  4,  5,  5 } },
3831     { ISD::SSUBSAT,    MVT::v8i16,   {  1 } },
3832     { ISD::SSUBSAT,    MVT::v16i8,   {  1 } },
3833     { ISD::UADDSAT,    MVT::v8i16,   {  1 } },
3834     { ISD::UADDSAT,    MVT::v16i8,   {  1 } },
3835     { ISD::UMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
3836     { ISD::UMAX,       MVT::v4i32,   {  2,  5,  8,  8 } },
3837     { ISD::UMAX,       MVT::v8i16,   {  1,  3,  3,  3 } },
3838     { ISD::UMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
3839     { ISD::UMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
3840     { ISD::UMIN,       MVT::v4i32,   {  2,  5,  8,  8 } },
3841     { ISD::UMIN,       MVT::v8i16,   {  1,  3,  3,  3 } },
3842     { ISD::UMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
3843     { ISD::USUBSAT,    MVT::v8i16,   {  1 } },
3844     { ISD::USUBSAT,    MVT::v16i8,   {  1 } },
3845     { ISD::FMAXNUM,    MVT::f64,     {  4 } },
3846     { ISD::FMAXNUM,    MVT::v2f64,   {  4 } },
3847     { ISD::FSQRT,      MVT::f64,     { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3848     { ISD::FSQRT,      MVT::v2f64,   { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3849   };
3850   static const CostKindTblEntry SSE1CostTbl[] = {
3851     { ISD::FMAXNUM,    MVT::f32,     {  4 } },
3852     { ISD::FMAXNUM,    MVT::v4f32,   {  4 } },
3853     { ISD::FSQRT,      MVT::f32,     { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3854     { ISD::FSQRT,      MVT::v4f32,   { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3855   };
3856   static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3857     { ISD::CTTZ,       MVT::i64,     {  1 } },
3858   };
3859   static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3860     { ISD::CTTZ,       MVT::i32,     {  1 } },
3861     { ISD::CTTZ,       MVT::i16,     {  1 } },
3862     { ISD::CTTZ,       MVT::i8,      {  1 } },
3863   };
3864   static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3865     { ISD::CTLZ,       MVT::i64,     {  1 } },
3866   };
3867   static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3868     { ISD::CTLZ,       MVT::i32,     {  1 } },
3869     { ISD::CTLZ,       MVT::i16,     {  2 } },
3870     { ISD::CTLZ,       MVT::i8,      {  2 } },
3871   };
3872   static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3873     { ISD::CTPOP,      MVT::i64,     {  1, 1, 1, 1 } }, // popcnt
3874   };
3875   static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3876     { ISD::CTPOP,      MVT::i32,     {  1, 1, 1, 1 } }, // popcnt
3877     { ISD::CTPOP,      MVT::i16,     {  1, 1, 2, 2 } }, // popcnt(zext())
3878     { ISD::CTPOP,      MVT::i8,      {  1, 1, 2, 2 } }, // popcnt(zext())
3879   };
3880   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
3881     { ISD::ABS,        MVT::i64,     {  1,  2,  3,  4 } }, // SUB+CMOV
3882     { ISD::BITREVERSE, MVT::i64,     { 14 } },
3883     { ISD::BSWAP,      MVT::i64,     {  1 } },
3884     { ISD::CTLZ,       MVT::i64,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3885     { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSR+XOR
3886     { ISD::CTTZ,       MVT::i64,     {  3 } }, // TEST+BSF+CMOV/BRANCH
3887     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSF
3888     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
3889     { ISD::ROTL,       MVT::i64,     {  2, 3, 1, 3 } },
3890     { ISD::ROTR,       MVT::i64,     {  2, 3, 1, 3 } },
3891     { ISD::FSHL,       MVT::i64,     {  4, 4, 1, 4 } },
3892     { ISD::SMAX,       MVT::i64,     {  1,  3,  2,  3 } },
3893     { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
3894     { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
3895     { ISD::UMIN,       MVT::i64,     {  1,  3,  2,  3 } },
3896     { ISD::SADDO,      MVT::i64,     {  1 } },
3897     { ISD::UADDO,      MVT::i64,     {  1 } },
3898     { ISD::UMULO,      MVT::i64,     {  2 } }, // mulq + seto
3899   };
3900   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3901     { ISD::ABS,        MVT::i32,     {  1,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
3902     { ISD::ABS,        MVT::i16,     {  2,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
3903     { ISD::ABS,        MVT::i8,      {  2,  4,  4,  4 } }, // SUB+XOR+SRA
3904     { ISD::BITREVERSE, MVT::i32,     { 14 } },
3905     { ISD::BITREVERSE, MVT::i16,     { 14 } },
3906     { ISD::BITREVERSE, MVT::i8,      { 11 } },
3907     { ISD::BSWAP,      MVT::i32,     {  1 } },
3908     { ISD::BSWAP,      MVT::i16,     {  1 } }, // ROL
3909     { ISD::CTLZ,       MVT::i32,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3910     { ISD::CTLZ,       MVT::i16,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3911     { ISD::CTLZ,       MVT::i8,      {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3912     { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{  1,  1,  1,  1 } }, // BSR+XOR
3913     { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{  2,  2,  3,  3 } }, // BSR+XOR
3914     { ISD::CTLZ_ZERO_UNDEF, MVT::i8, {  2,  2,  3,  3 } }, // BSR+XOR
3915     { ISD::CTTZ,       MVT::i32,     {  3 } }, // TEST+BSF+CMOV/BRANCH
3916     { ISD::CTTZ,       MVT::i16,     {  3 } }, // TEST+BSF+CMOV/BRANCH
3917     { ISD::CTTZ,       MVT::i8,      {  3 } }, // TEST+BSF+CMOV/BRANCH
3918     { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{  1,  1,  1,  1 } }, // BSF
3919     { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{  2,  2,  1,  1 } }, // BSF
3920     { ISD::CTTZ_ZERO_UNDEF, MVT::i8, {  2,  2,  1,  1 } }, // BSF
3921     { ISD::CTPOP,      MVT::i32,     {  8,  7, 15, 15 } },
3922     { ISD::CTPOP,      MVT::i16,     {  9,  8, 17, 17 } },
3923     { ISD::CTPOP,      MVT::i8,      {  7,  6, 13, 13 } },
3924     { ISD::ROTL,       MVT::i32,     {  2,  3,  1,  3 } },
3925     { ISD::ROTL,       MVT::i16,     {  2,  3,  1,  3 } },
3926     { ISD::ROTL,       MVT::i8,      {  2,  3,  1,  3 } },
3927     { ISD::ROTR,       MVT::i32,     {  2,  3,  1,  3 } },
3928     { ISD::ROTR,       MVT::i16,     {  2,  3,  1,  3 } },
3929     { ISD::ROTR,       MVT::i8,      {  2,  3,  1,  3 } },
3930     { ISD::FSHL,       MVT::i32,     {  4,  4,  1,  4 } },
3931     { ISD::FSHL,       MVT::i16,     {  4,  4,  2,  5 } },
3932     { ISD::FSHL,       MVT::i8,      {  4,  4,  2,  5 } },
3933     { ISD::SMAX,       MVT::i32,     {  1,  2,  2,  3 } },
3934     { ISD::SMAX,       MVT::i16,     {  1,  4,  2,  4 } },
3935     { ISD::SMAX,       MVT::i8,      {  1,  4,  2,  4 } },
3936     { ISD::SMIN,       MVT::i32,     {  1,  2,  2,  3 } },
3937     { ISD::SMIN,       MVT::i16,     {  1,  4,  2,  4 } },
3938     { ISD::SMIN,       MVT::i8,      {  1,  4,  2,  4 } },
3939     { ISD::UMAX,       MVT::i32,     {  1,  2,  2,  3 } },
3940     { ISD::UMAX,       MVT::i16,     {  1,  4,  2,  4 } },
3941     { ISD::UMAX,       MVT::i8,      {  1,  4,  2,  4 } },
3942     { ISD::UMIN,       MVT::i32,     {  1,  2,  2,  3 } },
3943     { ISD::UMIN,       MVT::i16,     {  1,  4,  2,  4 } },
3944     { ISD::UMIN,       MVT::i8,      {  1,  4,  2,  4 } },
3945     { ISD::SADDO,      MVT::i32,     {  1 } },
3946     { ISD::SADDO,      MVT::i16,     {  1 } },
3947     { ISD::SADDO,      MVT::i8,      {  1 } },
3948     { ISD::UADDO,      MVT::i32,     {  1 } },
3949     { ISD::UADDO,      MVT::i16,     {  1 } },
3950     { ISD::UADDO,      MVT::i8,      {  1 } },
3951     { ISD::UMULO,      MVT::i32,     {  2 } }, // mul + seto
3952     { ISD::UMULO,      MVT::i16,     {  2 } },
3953     { ISD::UMULO,      MVT::i8,      {  2 } },
3954   };
3955 
3956   Type *RetTy = ICA.getReturnType();
3957   Type *OpTy = RetTy; // type that gets legalized and costed below
3958   Intrinsic::ID IID = ICA.getID();
       // Map the intrinsic onto the ISD opcode that keys the cost tables above.
       // DELETED_NODE is the "no mapping" sentinel: the table lookups that
       // follow the switch are skipped when ISD is still DELETED_NODE.
3959   unsigned ISD = ISD::DELETED_NODE;
3960   switch (IID) {
3961   default:
3962     break;
3963   case Intrinsic::abs:
3964     ISD = ISD::ABS;
3965     break;
3966   case Intrinsic::bitreverse:
3967     ISD = ISD::BITREVERSE;
3968     break;
3969   case Intrinsic::bswap:
3970     ISD = ISD::BSWAP;
3971     break;
3972   case Intrinsic::ctlz:
3973     ISD = ISD::CTLZ;
3974     break;
3975   case Intrinsic::ctpop:
3976     ISD = ISD::CTPOP;
3977     break;
3978   case Intrinsic::cttz:
3979     ISD = ISD::CTTZ;
3980     break;
3981   case Intrinsic::fshl:
3982     ISD = ISD::FSHL;
         // When argument values are available, a funnel shift whose first two
         // arguments are the same value is costed as a rotate left instead.
3983     if (!ICA.isTypeBasedOnly()) {
3984       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3985       if (Args[0] == Args[1])
3986         ISD = ISD::ROTL;
3987     }
3988     break;
3989   case Intrinsic::fshr:
3990     // FSHR has same costs so don't duplicate.
3991     ISD = ISD::FSHL;
         // Same rotate special case as fshl, but costed as a rotate right.
3992     if (!ICA.isTypeBasedOnly()) {
3993       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3994       if (Args[0] == Args[1])
3995         ISD = ISD::ROTR;
3996     }
3997     break;
3998   case Intrinsic::maxnum:
3999   case Intrinsic::minnum:
4000     // FMINNUM has same costs so don't duplicate.
4001     ISD = ISD::FMAXNUM;
4002     break;
4003   case Intrinsic::sadd_sat:
4004     ISD = ISD::SADDSAT;
4005     break;
4006   case Intrinsic::smax:
4007     ISD = ISD::SMAX;
4008     break;
4009   case Intrinsic::smin:
4010     ISD = ISD::SMIN;
4011     break;
4012   case Intrinsic::ssub_sat:
4013     ISD = ISD::SSUBSAT;
4014     break;
4015   case Intrinsic::uadd_sat:
4016     ISD = ISD::UADDSAT;
4017     break;
4018   case Intrinsic::umax:
4019     ISD = ISD::UMAX;
4020     break;
4021   case Intrinsic::umin:
4022     ISD = ISD::UMIN;
4023     break;
4024   case Intrinsic::usub_sat:
4025     ISD = ISD::USUBSAT;
4026     break;
4027   case Intrinsic::sqrt:
4028     ISD = ISD::FSQRT;
4029     break;
4030   case Intrinsic::sadd_with_overflow:
4031   case Intrinsic::ssub_with_overflow:
4032     // SSUBO has same costs so don't duplicate.
4033     ISD = ISD::SADDO;
4034     OpTy = RetTy->getContainedType(0); // cost on contained type 0 of the result
4035     break;
4036   case Intrinsic::uadd_with_overflow:
4037   case Intrinsic::usub_with_overflow:
4038     // USUBO has same costs so don't duplicate.
4039     ISD = ISD::UADDO;
4040     OpTy = RetTy->getContainedType(0); // cost on contained type 0 of the result
4041     break;
4042   case Intrinsic::umul_with_overflow:
4043   case Intrinsic::smul_with_overflow:
4044     // SMULO has same costs so don't duplicate.
4045     ISD = ISD::UMULO;
4046     OpTy = RetTy->getContainedType(0); // cost on contained type 0 of the result
4047     break;
4048   }
4049 
4050   if (ISD != ISD::DELETED_NODE) {
4051     // Legalize the type.
4052     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4053     MVT MTy = LT.second;
4054 
4055     // Attempt to lookup cost.
4056     if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
4057         MTy.isVector()) {
4058       // With PSHUFB the code is very similar for all types. If we have integer
4059       // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
4060       // we also need a PSHUFB.
4061       unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
4062 
4063       // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
4064       // instructions. We also need an extract and an insert.
4065       if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
4066             (ST->hasBWI() && MTy.is512BitVector())))
4067         Cost = Cost * 2 + 2;
4068 
4069       return LT.first * Cost;
4070     }
4071 
4072     // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4073     if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4074          (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4075         !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4076       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4077       if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4078         if (Cst->isAllOnesValue())
4079           ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4080     }
4081 
4082     // FSQRT is a single instruction.
4083     if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4084       return LT.first;
4085 
4086     auto adjustTableCost = [](int ISD, unsigned Cost,
4087                               InstructionCost LegalizationCost,
4088                               FastMathFlags FMF) {
           // Turns a matched table entry into the returned cost. At the call
           // sites below, ISD is the entry's opcode, Cost is the entry's value
           // for the requested cost kind, LegalizationCost is LT.first and FMF
           // comes from ICA.getFlags(). The result is LegalizationCost * Cost,
           // except for the no-NaNs min/max case handled first.
4089       // If there are no NaNs to deal with, then these are reduced to a
4090       // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4091       // assume is used in the non-fast case.
4092       if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4093         if (FMF.noNaNs())
4094           return LegalizationCost * 1; // table value is ignored here
4095       }
4096       return LegalizationCost * (int)Cost;
4097     };
4098 
4099     if (ST->useGLMDivSqrtCosts())
4100       if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4101         if (auto KindCost = Entry->Cost[CostKind])
4102           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4103                                  ICA.getFlags());
4104 
4105     if (ST->useSLMArithCosts())
4106       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4107         if (auto KindCost = Entry->Cost[CostKind])
4108           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4109                                  ICA.getFlags());
4110 
4111     if (ST->hasVBMI2())
4112       if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4113         if (auto KindCost = Entry->Cost[CostKind])
4114           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4115                                  ICA.getFlags());
4116 
4117     if (ST->hasBITALG())
4118       if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4119         if (auto KindCost = Entry->Cost[CostKind])
4120           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4121                                  ICA.getFlags());
4122 
4123     if (ST->hasVPOPCNTDQ())
4124       if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4125         if (auto KindCost = Entry->Cost[CostKind])
4126           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4127                                  ICA.getFlags());
4128 
4129     if (ST->hasCDI())
4130       if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4131         if (auto KindCost = Entry->Cost[CostKind])
4132           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4133                                  ICA.getFlags());
4134 
4135     if (ST->hasBWI())
4136       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4137         if (auto KindCost = Entry->Cost[CostKind])
4138           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4139                                  ICA.getFlags());
4140 
4141     if (ST->hasAVX512())
4142       if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4143         if (auto KindCost = Entry->Cost[CostKind])
4144           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4145                                  ICA.getFlags());
4146 
4147     if (ST->hasXOP())
4148       if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4149         if (auto KindCost = Entry->Cost[CostKind])
4150           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4151                                  ICA.getFlags());
4152 
4153     if (ST->hasAVX2())
4154       if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4155         if (auto KindCost = Entry->Cost[CostKind])
4156           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4157                                  ICA.getFlags());
4158 
4159     if (ST->hasAVX())
4160       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4161         if (auto KindCost = Entry->Cost[CostKind])
4162           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4163                                  ICA.getFlags());
4164 
4165     if (ST->hasSSE42())
4166       if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4167         if (auto KindCost = Entry->Cost[CostKind])
4168           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4169                                  ICA.getFlags());
4170 
4171     if (ST->hasSSE41())
4172       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4173         if (auto KindCost = Entry->Cost[CostKind])
4174           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4175                                  ICA.getFlags());
4176 
4177     if (ST->hasSSSE3())
4178       if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4179         if (auto KindCost = Entry->Cost[CostKind])
4180           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4181                                  ICA.getFlags());
4182 
4183     if (ST->hasSSE2())
4184       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4185         if (auto KindCost = Entry->Cost[CostKind])
4186           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4187                                  ICA.getFlags());
4188 
4189     if (ST->hasSSE1())
4190       if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4191         if (auto KindCost = Entry->Cost[CostKind])
4192           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4193                                  ICA.getFlags());
4194 
4195     if (ST->hasBMI()) {
4196       if (ST->is64Bit())
4197         if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4198           if (auto KindCost = Entry->Cost[CostKind])
4199             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4200                                    ICA.getFlags());
4201 
4202       if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4203         if (auto KindCost = Entry->Cost[CostKind])
4204           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4205                                  ICA.getFlags());
4206     }
4207 
4208     if (ST->hasLZCNT()) {
4209       if (ST->is64Bit())
4210         if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4211           if (auto KindCost = Entry->Cost[CostKind])
4212             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4213                                    ICA.getFlags());
4214 
4215       if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4216         if (auto KindCost = Entry->Cost[CostKind])
4217           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4218                                  ICA.getFlags());
4219     }
4220 
4221     if (ST->hasPOPCNT()) {
4222       if (ST->is64Bit())
4223         if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4224           if (auto KindCost = Entry->Cost[CostKind])
4225             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4226                                    ICA.getFlags());
4227 
4228       if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4229         if (auto KindCost = Entry->Cost[CostKind])
4230           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4231                                  ICA.getFlags());
4232     }
4233 
4234     if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4235       if (const Instruction *II = ICA.getInst()) {
4236         if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4237           return TTI::TCC_Free;
4238         if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4239           if (LI->hasOneUse())
4240             return TTI::TCC_Free;
4241         }
4242       }
4243     }
4244 
4245     if (ST->is64Bit())
4246       if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4247         if (auto KindCost = Entry->Cost[CostKind])
4248           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4249                                  ICA.getFlags());
4250 
4251     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4252       if (auto KindCost = Entry->Cost[CostKind])
4253         return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4254   }
4255 
4256   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4257 }
4258 
/// Returns the X86 cost estimate for inserting or extracting a single vector
/// element.
///
/// \param Opcode   Compared against Instruction::ExtractElement and
///                 Instruction::InsertElement; any other opcode is forwarded
///                 to BaseT::getVectorInstrCost.
/// \param Val      The vector type being operated on (asserted to be a vector).
/// \param CostKind Forwarded to the memory-op, shuffle and base-class queries.
/// \param Index    Element index; -1U is treated as a non-immediate index.
/// \param Op0      May be null. Only checked for being an UndefValue when
///                 inserting at (normalized) index 0.
/// \param Op1      May be null. Only checked for being a LoadInst or an
///                 integer Constant when inserting at (normalized) index 0.
/// \returns The estimated cost.
4259 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4260                                                TTI::TargetCostKind CostKind,
4261                                                unsigned Index, Value *Op0,
4262                                                Value *Op1) {
  // Per-scalar-type extraction costs, consulted below only when the subtarget
  // reports useSLMArithCosts(). The table has no INSERT_VECTOR_ELT entries.
4263   static const CostTblEntry SLMCostTbl[] = {
4264      { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
4265      { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
4266      { ISD::EXTRACT_VECTOR_ELT,       MVT::i32,     4 },
4267      { ISD::EXTRACT_VECTOR_ELT,       MVT::i64,     7 }
4268    };
4269 
4270   assert(Val->isVectorTy() && "This must be a vector type");
4271   Type *ScalarType = Val->getScalarType();
  // Accumulates the cost of moving 128-bit subvectors around; it is added to
  // every result returned from the immediate-index path below.
4272   InstructionCost RegisterFileMoveCost = 0;
4273 
4274   // Non-immediate extraction/insertion can be handled as a sequence of
4275   // aliased loads+stores via the stack.
4276   if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4277                        Opcode == Instruction::InsertElement)) {
4278     // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4279     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4280 
4281     // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4282     assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4283     Align VecAlign = DL.getPrefTypeAlign(Val);
4284     Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4285 
4286     // Extract - store vector to stack, load scalar.
4287     if (Opcode == Instruction::ExtractElement) {
4288       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4289              getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4290                              CostKind);
4291     }
4292     // Insert - store vector to stack, store scalar, load vector.
4293     if (Opcode == Instruction::InsertElement) {
4294       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4295              getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4296                              CostKind) +
4297              getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4298     }
4299   }
4300 
  // Immediate-index insert/extract. Every path through this block returns.
4301   if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4302                        Opcode == Instruction::InsertElement)) {
4303     // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4304     if (Opcode == Instruction::ExtractElement &&
4305         ScalarType->getScalarSizeInBits() == 1 &&
4306         cast<FixedVectorType>(Val)->getNumElements() > 1)
4307       return 1;
4308 
4309     // Legalize the type.
4310     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4311 
4312     // This type is legalized to a scalar type.
4313     if (!LT.second.isVector())
4314       return 0;
4315 
4316     // The type may be split. Normalize the index to the new type.
4317     unsigned SizeInBits = LT.second.getSizeInBits();
4318     unsigned NumElts = LT.second.getVectorNumElements();
4319     unsigned SubNumElts = NumElts;
4320     Index = Index % NumElts;
4321 
4322     // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4323     // For inserts, we also need to insert the subvector back.
    // Only indices beyond the first 128-bit subvector pay this cost: 2 for an
    // insert, 1 for an extract. Index is then reduced to be relative to its
    // 128-bit subvector.
4324     if (SizeInBits > 128) {
4325       assert((SizeInBits % 128) == 0 && "Illegal vector");
4326       unsigned NumSubVecs = SizeInBits / 128;
4327       SubNumElts = NumElts / NumSubVecs;
4328       if (SubNumElts <= Index) {
4329         RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4330         Index %= SubNumElts;
4331       }
4332     }
4333 
4334     MVT MScalarTy = LT.second.getScalarType();
4335     auto IsCheapPInsrPExtrInsertPS = [&]() {
4336       // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4337       // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4338       return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4339              (MScalarTy.isInteger() && ST->hasSSE41()) ||
4340              (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4341               Opcode == Instruction::InsertElement);
4342     };
4343 
    // Special cases for element #0 (after the normalization above). If none
    // of them returns, control falls through to the generic handling.
4344     if (Index == 0) {
4345       // Floating point scalars are already located in index #0.
4346       // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4347       // true for all.
4348       if (ScalarType->isFloatingPointTy())
4349         return RegisterFileMoveCost;
4350 
4351       if (Opcode == Instruction::InsertElement &&
4352           isa_and_nonnull<UndefValue>(Op0)) {
4353         // Consider the gather cost to be cheap.
4354         if (isa_and_nonnull<LoadInst>(Op1))
4355           return RegisterFileMoveCost;
4356         if (!IsCheapPInsrPExtrInsertPS()) {
4357           // mov constant-to-GPR + movd/movq GPR -> XMM.
4358           if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4359             return 2 + RegisterFileMoveCost;
4360           // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4361           return 1 + RegisterFileMoveCost;
4362         }
4363       }
4364 
4365       // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4366       if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4367         return 1 + RegisterFileMoveCost;
4368     }
4369 
4370     int ISD = TLI->InstructionOpcodeToISD(Opcode);
4371     assert(ISD && "Unexpected vector opcode");
    // The SLM table is keyed on the legalized scalar type, not on the IR
    // scalar type.
4372     if (ST->useSLMArithCosts())
4373       if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4374         return Entry->Cost + RegisterFileMoveCost;
4375 
4376     // Consider cheap cases.
4377     if (IsCheapPInsrPExtrInsertPS())
4378       return 1 + RegisterFileMoveCost;
4379 
4380     // For extractions we just need to shuffle the element to index 0, which
4381     // should be very cheap (assume cost = 1). For insertions we need to shuffle
4382     // the elements to its destination. In both cases we must handle the
4383     // subvector move(s).
4384     // If the vector type is already less than 128-bits then don't reduce it.
4385     // TODO: Under what circumstances should we shuffle using the full width?
4386     InstructionCost ShuffleCost = 1;
4387     if (Opcode == Instruction::InsertElement) {
      // Cost the insert as a two-source permute. The original type is used
      // unless legalization changed the scalar type or the vector is at least
      // 128 bits wide, in which case a SubNumElts-element vector is used.
4388       auto *SubTy = cast<VectorType>(Val);
4389       EVT VT = TLI->getValueType(DL, Val);
4390       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4391         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4392       ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4393                                    CostKind, 0, SubTy);
4394     }
    // Non floating-point elements are charged one extra unit on top of the
    // shuffle.
4395     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4396     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4397   }
4398 
4399   // Add to the base cost if we know that the extracted element of a vector is
4400   // destined to be moved to and used in the integer register file.
  // NOTE(review): both branches above return whenever Opcode is
  // ExtractElement or InsertElement, so the ExtractElement condition below
  // looks unreachable from here - confirm before relying on it.
4401   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
4402     RegisterFileMoveCost += 1;
4403 
4404   return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4405          RegisterFileMoveCost;
4406 }
4407 
/// Estimates the cost of building (\p Insert) and/or taking apart
/// (\p Extract) the elements of \p Ty selected by \p DemandedElts.
///
/// \param Ty           Vector type; it is cast to FixedVectorType below.
/// \param DemandedElts One bit per element of \p Ty (asserted).
/// \param Insert       Account for inserting the demanded elements.
/// \param Extract      Account for extracting the demanded elements.
/// \param CostKind     Forwarded to the shuffle and base-class queries.
/// \returns The accumulated cost. Two Extract paths return early: the vXi1
///          MOVMSK path (taken only when \p Insert is false) and the
///          >128-bit lane-wise path, which returns the running total
///          including any insertion cost already added.
4408 InstructionCost
4409 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4410                                      bool Insert, bool Extract,
4411                                      TTI::TargetCostKind CostKind) {
4412   assert(DemandedElts.getBitWidth() ==
4413              cast<FixedVectorType>(Ty)->getNumElements() &&
4414          "Vector size mismatch");
4415 
4416   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4417   MVT MScalarTy = LT.second.getScalarType();
4418   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4419   InstructionCost Cost = 0;
4420 
  // Legal vectors wider than one lane are processed in 128-bit lanes below.
4421   constexpr unsigned LaneBitWidth = 128;
4422   assert((LegalVectorBitWidth < LaneBitWidth ||
4423           (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4424          "Illegal vector");
4425 
  // NOTE(review): the result of getValue() is dereferenced unchecked;
  // presumably LT.first is always a valid cost here - confirm.
4426   const int NumLegalVectors = *LT.first.getValue();
4427   assert(NumLegalVectors >= 0 && "Negative cost!");
4428 
4429   // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
4430   // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4431   if (Insert) {
    // Same scalar-type/feature combinations that getVectorInstrCost treats as
    // cheap direct inserts.
4432     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4433         (MScalarTy.isInteger() && ST->hasSSE41()) ||
4434         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4435       // For types we can insert directly, insertion into 128-bit sub vectors is
4436       // cheap, followed by a cheap chain of concatenations.
4437       if (LegalVectorBitWidth <= LaneBitWidth) {
4438         Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4439                                                 /*Extract*/ false, CostKind);
4440       } else {
4441         // In each 128-lane, if at least one index is demanded but not all
4442         // indices are demanded and this 128-lane is not the first 128-lane of
4443         // the legalized-vector, then this 128-lane needs an extracti128; If in
4444         // each 128-lane, there is at least one demanded index, this 128-lane
4445         // needs an inserti128.
4446 
4447         // The following cases will help you build a better understanding:
4448         // Assume we insert several elements into a v8i32 vector in avx2,
4449         // Case#1: inserting into index 1 needs vpinsrd + inserti128.
4450         // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
4451         // inserti128.
4452         // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4453         assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4454         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4455         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4456         unsigned NumLegalElts =
4457             LT.second.getVectorNumElements() * NumLegalVectors;
4458         assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4459                "Vector has been legalized to smaller element count");
4460         assert((NumLegalElts % NumLanesTotal) == 0 &&
4461                "Unexpected elts per lane");
4462         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4463 
        // Pad the demanded mask with zero bits up to the legalized element
        // count so that it can be sliced into whole lanes.
4464         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4465         auto *LaneTy =
4466             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4467 
        // Per lane: skip lanes with nothing demanded, charge a subvector
        // extract for partially demanded lanes, then the element inserts.
4468         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4469           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4470               NumEltsPerLane, NumEltsPerLane * I);
4471           if (LaneEltMask.isNullValue())
4472             continue;
4473           // FIXME: we don't need to extract if all non-demanded elements
4474           //        are legalization-inserted padding.
4475           if (!LaneEltMask.isAllOnes())
4476             Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4477                                    CostKind, I * NumEltsPerLane, LaneTy);
4478           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4479                                                   /*Extract*/ false, CostKind);
4480         }
4481 
        // AffectedLanes: one bit per lane with at least one demanded element.
        // FullyAffectedLegalVectors: one bit per legal vector, set only when
        // all of its lanes are affected (MatchAllBits).
4482         APInt AffectedLanes =
4483             APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4484         APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4485             AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4486         for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4487           for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4488             unsigned I = NumLegalLanes * LegalVec + Lane;
4489             // No need to insert unaffected lane; or lane 0 of each legal vector
4490             // iff ALL lanes of that vector were affected and will be inserted.
4491             if (!AffectedLanes[I] ||
4492                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4493               continue;
4494             Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4495                                    CostKind, I * NumEltsPerLane, LaneTy);
4496           }
4497         }
4498       }
4499     } else if (LT.second.isVector()) {
4500       // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4501       // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4502       // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4503       // considered cheap.
4504       if (Ty->isIntOrIntVectorTy())
4505         Cost += DemandedElts.countPopulation();
4506 
4507       // Get the smaller of the legalized or original pow2-extended number of
4508       // vector elements, which represents the number of unpacks we'll end up
4509       // performing.
4510       unsigned NumElts = LT.second.getVectorNumElements();
4511       unsigned Pow2Elts =
4512           PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4513       Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4514     }
    // Note: when neither condition above holds (no fast insertion and the type
    // legalizes to a scalar), no insertion cost is added.
4515   }
4516 
4517   if (Extract) {
4518     // vXi1 can be efficiently extracted with MOVMSK.
4519     // TODO: AVX512 predicate mask handling.
4520     // NOTE: This doesn't work well for roundtrip scalarization.
4521     if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
      // One MOVMSK per MaxElts elements, rounded up; DemandedElts is not
      // consulted on this path.
4522       unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4523       unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4524       unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4525       return MOVMSKCost;
4526     }
4527 
4528     if (LT.second.isVector()) {
4529       unsigned NumLegalElts =
4530           LT.second.getVectorNumElements() * NumLegalVectors;
4531       assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4532              "Vector has been legalized to smaller element count");
4533 
4534       // If we're extracting elements from a 128-bit subvector lane,
4535       // we only need to extract each lane once, not for every element.
4536       if (LegalVectorBitWidth > LaneBitWidth) {
4537         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4538         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4539         assert((NumLegalElts % NumLanesTotal) == 0 &&
4540                "Unexpected elts per lane");
4541         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4542 
4543         // Add cost for each demanded 128-bit subvector extraction.
4544         // Luckily this is a lot easier than for insertion.
4545         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4546         auto *LaneTy =
4547             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4548 
4549         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4550           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4551               NumEltsPerLane, I * NumEltsPerLane);
4552           if (LaneEltMask.isNullValue())
4553             continue;
          // Unlike the insertion path, every lane with a demanded element is
          // charged a subvector extract, including lane 0.
4554           Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4555                                  CostKind, I * NumEltsPerLane, LaneTy);
4556           Cost += BaseT::getScalarizationOverhead(
4557               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4558         }
4559 
4560         return Cost;
4561       }
4562     }
4563 
4564     // Fallback to default extraction.
4565     Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4566                                             Extract, CostKind);
4567   }
4568 
4569   return Cost;
4570 }
4571 
4572 InstructionCost
4573 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4574                                       int VF, const APInt &DemandedDstElts,
4575                                       TTI::TargetCostKind CostKind) {
4576   const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4577   // We don't differentiate element types here, only element bit width.
4578   EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4579 
4580   auto bailout = [&]() {
4581     return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4582                                             DemandedDstElts, CostKind);
4583   };
4584 
4585   // For now, only deal with AVX512 cases.
4586   if (!ST->hasAVX512())
4587     return bailout();
4588 
4589   // Do we have a native shuffle for this element type, or should we promote?
4590   unsigned PromEltTyBits = EltTyBits;
4591   switch (EltTyBits) {
4592   case 32:
4593   case 64:
4594     break; // AVX512F.
4595   case 16:
4596     if (!ST->hasBWI())
4597       PromEltTyBits = 32; // promote to i32, AVX512F.
4598     break;                // AVX512BW
4599   case 8:
4600     if (!ST->hasVBMI())
4601       PromEltTyBits = 32; // promote to i32, AVX512F.
4602     break;                // AVX512VBMI
4603   case 1:
4604     // There is no support for shuffling i1 elements. We *must* promote.
4605     if (ST->hasBWI()) {
4606       if (ST->hasVBMI())
4607         PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4608       else
4609         PromEltTyBits = 16; // promote to i16, AVX512BW.
4610       break;
4611     }
4612     PromEltTyBits = 32; // promote to i32, AVX512F.
4613     break;
4614   default:
4615     return bailout();
4616   }
4617   auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4618 
4619   auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4620   auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4621 
4622   int NumDstElements = VF * ReplicationFactor;
4623   auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4624   auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4625 
4626   // Legalize the types.
4627   MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4628   MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4629   MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4630   MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4631   // They should have legalized into vector types.
4632   if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4633       !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4634     return bailout();
4635 
4636   if (PromEltTyBits != EltTyBits) {
4637     // If we have to perform the shuffle with wider elt type than our data type,
4638     // then we will first need to anyext (we don't care about the new bits)
4639     // the source elements, and then truncate Dst elements.
4640     InstructionCost PromotionCost;
4641     PromotionCost += getCastInstrCost(
4642         Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4643         TargetTransformInfo::CastContextHint::None, CostKind);
4644     PromotionCost +=
4645         getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4646                          /*Src=*/PromDstVecTy,
4647                          TargetTransformInfo::CastContextHint::None, CostKind);
4648     return PromotionCost + getReplicationShuffleCost(PromEltTy,
4649                                                      ReplicationFactor, VF,
4650                                                      DemandedDstElts, CostKind);
4651   }
4652 
4653   assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4654          LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4655          "We expect that the legalization doesn't affect the element width, "
4656          "doesn't coalesce/split elements.");
4657 
4658   unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4659   unsigned NumDstVectors =
4660       divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4661 
4662   auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4663 
4664   // Not all the produced Dst elements may be demanded. In our case,
4665   // given that a single Dst vector is formed by a single shuffle,
4666   // if all elements that will form a single Dst vector aren't demanded,
4667   // then we won't need to do that shuffle, so adjust the cost accordingly.
4668   APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4669       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4670   unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
4671 
4672   InstructionCost SingleShuffleCost = getShuffleCost(
4673       TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4674       /*Index=*/0, /*SubTp=*/nullptr);
4675   return NumDstVectorsDemanded * SingleShuffleCost;
4676 }
4677 
4678 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4679                                             MaybeAlign Alignment,
4680                                             unsigned AddressSpace,
4681                                             TTI::TargetCostKind CostKind,
4682                                             TTI::OperandValueInfo OpInfo,
4683                                             const Instruction *I) {
4684   // TODO: Handle other cost kinds.
4685   if (CostKind != TTI::TCK_RecipThroughput) {
4686     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4687       // Store instruction with index and scale costs 2 Uops.
4688       // Check the preceding GEP to identify non-const indices.
4689       if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4690         if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4691           return TTI::TCC_Basic * 2;
4692       }
4693     }
4694     return TTI::TCC_Basic;
4695   }
4696 
4697   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4698          "Invalid Opcode");
4699   // Type legalization can't handle structs
4700   if (TLI->getValueType(DL, Src, true) == MVT::Other)
4701     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4702                                   CostKind);
4703 
4704   // Legalize the type.
4705   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4706 
4707   auto *VTy = dyn_cast<FixedVectorType>(Src);
4708 
4709   InstructionCost Cost = 0;
4710 
4711   // Add a cost for constant load to vector.
4712   if (Opcode == Instruction::Store && OpInfo.isConstant())
4713     Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4714                             /*AddressSpace=*/0, CostKind);
4715 
4716   // Handle the simple case of non-vectors.
4717   // NOTE: this assumes that legalization never creates vector from scalars!
4718   if (!VTy || !LT.second.isVector()) {
4719     // Each load/store unit costs 1.
4720     return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4721   }
4722 
4723   bool IsLoad = Opcode == Instruction::Load;
4724 
4725   Type *EltTy = VTy->getElementType();
4726 
4727   const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4728 
4729   // Source of truth: how many elements were there in the original IR vector?
4730   const unsigned SrcNumElt = VTy->getNumElements();
4731 
4732   // How far have we gotten?
4733   int NumEltRemaining = SrcNumElt;
4734   // Note that we intentionally capture by-reference, NumEltRemaining changes.
4735   auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4736 
4737   const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4738 
4739   // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4740   const unsigned XMMBits = 128;
4741   if (XMMBits % EltTyBits != 0)
4742     // Vector size must be a multiple of the element size. I.e. no padding.
4743     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4744                                   CostKind);
4745   const int NumEltPerXMM = XMMBits / EltTyBits;
4746 
4747   auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4748 
4749   for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4750        NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4751     // How many elements would a single op deal with at once?
4752     if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4753       // Vector size must be a multiple of the element size. I.e. no padding.
4754       return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4755                                     CostKind);
4756     int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4757 
4758     assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4759     assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4760             (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4761            "Unless we haven't halved the op size yet, "
4762            "we have less than two op's sized units of work left.");
4763 
4764     auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4765                           ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4766                           : XMMVecTy;
4767 
4768     assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4769            "After halving sizes, the vector elt count is no longer a multiple "
4770            "of number of elements per operation?");
4771     auto *CoalescedVecTy =
4772         CurrNumEltPerOp == 1
4773             ? CurrVecTy
4774             : FixedVectorType::get(
4775                   IntegerType::get(Src->getContext(),
4776                                    EltTyBits * CurrNumEltPerOp),
4777                   CurrVecTy->getNumElements() / CurrNumEltPerOp);
4778     assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4779                DL.getTypeSizeInBits(CurrVecTy) &&
4780            "coalesciing elements doesn't change vector width.");
4781 
4782     while (NumEltRemaining > 0) {
4783       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
4784 
4785       // Can we use this vector size, as per the remaining element count?
4786       // Iff the vector is naturally aligned, we can do a wide load regardless.
4787       if (NumEltRemaining < CurrNumEltPerOp &&
4788           (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4789           CurrOpSizeBytes != 1)
4790         break; // Try smalled vector size.
4791 
4792       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4793 
4794       // If we have fully processed the previous reg, we need to replenish it.
4795       if (SubVecEltsLeft == 0) {
4796         SubVecEltsLeft += CurrVecTy->getNumElements();
4797         // And that's free only for the 0'th subvector of a legalized vector.
4798         if (!Is0thSubVec)
4799           Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4800                                         : TTI::ShuffleKind::SK_ExtractSubvector,
4801                                  VTy, std::nullopt, CostKind, NumEltDone(),
4802                                  CurrVecTy);
4803       }
4804 
4805       // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4806       // for smaller widths (32/16/8) we have to insert/extract them separately.
4807       // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4808       // but let's pretend that it is also true for 16/8 bit wide ops...)
4809       if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4810         int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4811         assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4812         int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4813         APInt DemandedElts =
4814             APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4815                               CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4816         assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
4817         Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4818                                          !IsLoad, CostKind);
4819       }
4820 
4821       // This isn't exactly right. We're using slow unaligned 32-byte accesses
4822       // as a proxy for a double-pumped AVX memory interface such as on
4823       // Sandybridge.
4824       if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4825         Cost += 2;
4826       else
4827         Cost += 1;
4828 
4829       SubVecEltsLeft -= CurrNumEltPerOp;
4830       NumEltRemaining -= CurrNumEltPerOp;
4831       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4832     }
4833   }
4834 
4835   assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4836 
4837   return Cost;
4838 }
4839 
4840 InstructionCost
4841 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4842                                   unsigned AddressSpace,
4843                                   TTI::TargetCostKind CostKind) {
4844   bool IsLoad = (Instruction::Load == Opcode);
4845   bool IsStore = (Instruction::Store == Opcode);
4846 
4847   auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4848   if (!SrcVTy)
4849     // To calculate scalar take the regular cost, without mask
4850     return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4851 
4852   unsigned NumElem = SrcVTy->getNumElements();
4853   auto *MaskTy =
4854       FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4855   if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4856       (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4857     // Scalarization
4858     APInt DemandedElts = APInt::getAllOnes(NumElem);
4859     InstructionCost MaskSplitCost = getScalarizationOverhead(
4860         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
4861     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4862         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
4863         CmpInst::BAD_ICMP_PREDICATE, CostKind);
4864     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4865     InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
4866     InstructionCost ValueSplitCost = getScalarizationOverhead(
4867         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
4868     InstructionCost MemopCost =
4869         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4870                                          Alignment, AddressSpace, CostKind);
4871     return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4872   }
4873 
4874   // Legalize the type.
4875   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4876   auto VT = TLI->getValueType(DL, SrcVTy);
4877   InstructionCost Cost = 0;
4878   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4879       LT.second.getVectorNumElements() == NumElem)
4880     // Promotion requires extend/truncate for data and a shuffle for mask.
4881     Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
4882                            CostKind, 0, nullptr) +
4883             getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
4884                            CostKind, 0, nullptr);
4885 
4886   else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4887     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4888                                            LT.second.getVectorNumElements());
4889     // Expanding requires fill mask with zeroes
4890     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
4891                            CostKind, 0, MaskTy);
4892   }
4893 
4894   // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
4895   if (!ST->hasAVX512())
4896     return Cost + LT.first * (IsLoad ? 2 : 8);
4897 
4898   // AVX-512 masked load/store is cheaper
4899   return Cost + LT.first;
4900 }
4901 
4902 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
4903                                                       ScalarEvolution *SE,
4904                                                       const SCEV *Ptr) {
4905   // Address computations in vectorized code with non-consecutive addresses will
4906   // likely result in more instructions compared to scalar code where the
4907   // computation can more often be merged into the index mode. The resulting
4908   // extra micro-ops can significantly decrease throughput.
4909   const unsigned NumVectorInstToHideOverhead = 10;
4910 
4911   // Cost modeling of Strided Access Computation is hidden by the indexing
4912   // modes of X86 regardless of the stride value. We dont believe that there
4913   // is a difference between constant strided access in gerenal and constant
4914   // strided value which is less than or equal to 64.
4915   // Even in the case of (loop invariant) stride whose value is not known at
4916   // compile time, the address computation will not incur more than one extra
4917   // ADD instruction.
4918   if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
4919     // TODO: AVX2 is the current cut-off because we don't have correct
4920     //       interleaving costs for prior ISA's.
4921     if (!BaseT::isStridedAccess(Ptr))
4922       return NumVectorInstToHideOverhead;
4923     if (!BaseT::getConstantStrideStep(SE, Ptr))
4924       return 1;
4925   }
4926 
4927   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
4928 }
4929 
4930 InstructionCost
4931 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4932                                        std::optional<FastMathFlags> FMF,
4933                                        TTI::TargetCostKind CostKind) {
4934   if (TTI::requiresOrderedReduction(FMF))
4935     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4936 
4937   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
4938   // and make it as the cost.
4939 
4940   static const CostTblEntry SLMCostTblNoPairWise[] = {
4941     { ISD::FADD,  MVT::v2f64,   3 },
4942     { ISD::ADD,   MVT::v2i64,   5 },
4943   };
4944 
4945   static const CostTblEntry SSE2CostTblNoPairWise[] = {
4946     { ISD::FADD,  MVT::v2f64,   2 },
4947     { ISD::FADD,  MVT::v2f32,   2 },
4948     { ISD::FADD,  MVT::v4f32,   4 },
4949     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
4950     { ISD::ADD,   MVT::v2i32,   2 }, // FIXME: chosen to be less than v4i32
4951     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
4952     { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
4953     { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
4954     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
4955     { ISD::ADD,   MVT::v2i8,    2 },
4956     { ISD::ADD,   MVT::v4i8,    2 },
4957     { ISD::ADD,   MVT::v8i8,    2 },
4958     { ISD::ADD,   MVT::v16i8,   3 },
4959   };
4960 
4961   static const CostTblEntry AVX1CostTblNoPairWise[] = {
4962     { ISD::FADD,  MVT::v4f64,   3 },
4963     { ISD::FADD,  MVT::v4f32,   3 },
4964     { ISD::FADD,  MVT::v8f32,   4 },
4965     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
4966     { ISD::ADD,   MVT::v4i64,   3 },
4967     { ISD::ADD,   MVT::v8i32,   5 },
4968     { ISD::ADD,   MVT::v16i16,  5 },
4969     { ISD::ADD,   MVT::v32i8,   4 },
4970   };
4971 
4972   int ISD = TLI->InstructionOpcodeToISD(Opcode);
4973   assert(ISD && "Invalid opcode");
4974 
4975   // Before legalizing the type, give a chance to look up illegal narrow types
4976   // in the table.
4977   // FIXME: Is there a better way to do this?
4978   EVT VT = TLI->getValueType(DL, ValTy);
4979   if (VT.isSimple()) {
4980     MVT MTy = VT.getSimpleVT();
4981     if (ST->useSLMArithCosts())
4982       if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
4983         return Entry->Cost;
4984 
4985     if (ST->hasAVX())
4986       if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4987         return Entry->Cost;
4988 
4989     if (ST->hasSSE2())
4990       if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4991         return Entry->Cost;
4992   }
4993 
4994   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4995 
4996   MVT MTy = LT.second;
4997 
4998   auto *ValVTy = cast<FixedVectorType>(ValTy);
4999 
5000   // Special case: vXi8 mul reductions are performed as vXi16.
5001   if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5002     auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5003     auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5004     return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5005                             TargetTransformInfo::CastContextHint::None,
5006                             CostKind) +
5007            getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5008   }
5009 
5010   InstructionCost ArithmeticCost = 0;
5011   if (LT.first != 1 && MTy.isVector() &&
5012       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5013     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5014     auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5015                                             MTy.getVectorNumElements());
5016     ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5017     ArithmeticCost *= LT.first - 1;
5018   }
5019 
5020   if (ST->useSLMArithCosts())
5021     if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
5022       return ArithmeticCost + Entry->Cost;
5023 
5024   if (ST->hasAVX())
5025     if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
5026       return ArithmeticCost + Entry->Cost;
5027 
5028   if (ST->hasSSE2())
5029     if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
5030       return ArithmeticCost + Entry->Cost;
5031 
5032   // FIXME: These assume a naive kshift+binop lowering, which is probably
5033   // conservative in most cases.
5034   static const CostTblEntry AVX512BoolReduction[] = {
5035     { ISD::AND,  MVT::v2i1,   3 },
5036     { ISD::AND,  MVT::v4i1,   5 },
5037     { ISD::AND,  MVT::v8i1,   7 },
5038     { ISD::AND,  MVT::v16i1,  9 },
5039     { ISD::AND,  MVT::v32i1, 11 },
5040     { ISD::AND,  MVT::v64i1, 13 },
5041     { ISD::OR,   MVT::v2i1,   3 },
5042     { ISD::OR,   MVT::v4i1,   5 },
5043     { ISD::OR,   MVT::v8i1,   7 },
5044     { ISD::OR,   MVT::v16i1,  9 },
5045     { ISD::OR,   MVT::v32i1, 11 },
5046     { ISD::OR,   MVT::v64i1, 13 },
5047   };
5048 
5049   static const CostTblEntry AVX2BoolReduction[] = {
5050     { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
5051     { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
5052     { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
5053     { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
5054   };
5055 
5056   static const CostTblEntry AVX1BoolReduction[] = {
5057     { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
5058     { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
5059     { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
5060     { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
5061     { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
5062     { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
5063     { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
5064     { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
5065   };
5066 
5067   static const CostTblEntry SSE2BoolReduction[] = {
5068     { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
5069     { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
5070     { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
5071     { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
5072     { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
5073     { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
5074     { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
5075     { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
5076   };
5077 
5078   // Handle bool allof/anyof patterns.
5079   if (ValVTy->getElementType()->isIntegerTy(1)) {
5080     InstructionCost ArithmeticCost = 0;
5081     if (LT.first != 1 && MTy.isVector() &&
5082         MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5083       // Type needs to be split. We need LT.first - 1 arithmetic ops.
5084       auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5085                                               MTy.getVectorNumElements());
5086       ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5087       ArithmeticCost *= LT.first - 1;
5088     }
5089 
5090     if (ST->hasAVX512())
5091       if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5092         return ArithmeticCost + Entry->Cost;
5093     if (ST->hasAVX2())
5094       if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5095         return ArithmeticCost + Entry->Cost;
5096     if (ST->hasAVX())
5097       if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5098         return ArithmeticCost + Entry->Cost;
5099     if (ST->hasSSE2())
5100       if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5101         return ArithmeticCost + Entry->Cost;
5102 
5103     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5104   }
5105 
5106   unsigned NumVecElts = ValVTy->getNumElements();
5107   unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5108 
5109   // Special case power of 2 reductions where the scalar type isn't changed
5110   // by type legalization.
5111   if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5112     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5113 
5114   InstructionCost ReductionCost = 0;
5115 
5116   auto *Ty = ValVTy;
5117   if (LT.first != 1 && MTy.isVector() &&
5118       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5119     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5120     Ty = FixedVectorType::get(ValVTy->getElementType(),
5121                               MTy.getVectorNumElements());
5122     ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5123     ReductionCost *= LT.first - 1;
5124     NumVecElts = MTy.getVectorNumElements();
5125   }
5126 
5127   // Now handle reduction with the legal type, taking into account size changes
5128   // at each level.
5129   while (NumVecElts > 1) {
5130     // Determine the size of the remaining vector we need to reduce.
5131     unsigned Size = NumVecElts * ScalarSize;
5132     NumVecElts /= 2;
5133     // If we're reducing from 256/512 bits, use an extract_subvector.
5134     if (Size > 128) {
5135       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5136       ReductionCost +=
5137           getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5138                          NumVecElts, SubTy);
5139       Ty = SubTy;
5140     } else if (Size == 128) {
5141       // Reducing from 128 bits is a permute of v2f64/v2i64.
5142       FixedVectorType *ShufTy;
5143       if (ValVTy->isFloatingPointTy())
5144         ShufTy =
5145             FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5146       else
5147         ShufTy =
5148             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5149       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5150                                       std::nullopt, CostKind, 0, nullptr);
5151     } else if (Size == 64) {
5152       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5153       FixedVectorType *ShufTy;
5154       if (ValVTy->isFloatingPointTy())
5155         ShufTy =
5156             FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5157       else
5158         ShufTy =
5159             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5160       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5161                                       std::nullopt, CostKind, 0, nullptr);
5162     } else {
5163       // Reducing from smaller size is a shift by immediate.
5164       auto *ShiftTy = FixedVectorType::get(
5165           Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5166       ReductionCost += getArithmeticInstrCost(
5167           Instruction::LShr, ShiftTy, CostKind,
5168           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5169           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5170     }
5171 
5172     // Add the arithmetic op for this level.
5173     ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5174   }
5175 
5176   // Add the final extract element to the cost.
5177   return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5178                                             CostKind, 0, nullptr, nullptr);
5179 }
5180 
5181 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
5182                                           bool IsUnsigned) {
5183   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5184 
5185   MVT MTy = LT.second;
5186 
5187   int ISD;
5188   if (Ty->isIntOrIntVectorTy()) {
5189     ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
5190   } else {
5191     assert(Ty->isFPOrFPVectorTy() &&
5192            "Expected float point or integer vector type.");
5193     ISD = ISD::FMINNUM;
5194   }
5195 
5196   static const CostTblEntry SSE1CostTbl[] = {
5197     {ISD::FMINNUM, MVT::v4f32, 1},
5198   };
5199 
5200   static const CostTblEntry SSE2CostTbl[] = {
5201     {ISD::FMINNUM, MVT::v2f64, 1},
5202     {ISD::SMIN,    MVT::v8i16, 1},
5203     {ISD::UMIN,    MVT::v16i8, 1},
5204   };
5205 
5206   static const CostTblEntry SSE41CostTbl[] = {
5207     {ISD::SMIN,    MVT::v4i32, 1},
5208     {ISD::UMIN,    MVT::v4i32, 1},
5209     {ISD::UMIN,    MVT::v8i16, 1},
5210     {ISD::SMIN,    MVT::v16i8, 1},
5211   };
5212 
5213   static const CostTblEntry SSE42CostTbl[] = {
5214     {ISD::UMIN,    MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
5215   };
5216 
5217   static const CostTblEntry AVX1CostTbl[] = {
5218     {ISD::FMINNUM, MVT::v8f32,  1},
5219     {ISD::FMINNUM, MVT::v4f64,  1},
5220     {ISD::SMIN,    MVT::v8i32,  3},
5221     {ISD::UMIN,    MVT::v8i32,  3},
5222     {ISD::SMIN,    MVT::v16i16, 3},
5223     {ISD::UMIN,    MVT::v16i16, 3},
5224     {ISD::SMIN,    MVT::v32i8,  3},
5225     {ISD::UMIN,    MVT::v32i8,  3},
5226   };
5227 
5228   static const CostTblEntry AVX2CostTbl[] = {
5229     {ISD::SMIN,    MVT::v8i32,  1},
5230     {ISD::UMIN,    MVT::v8i32,  1},
5231     {ISD::SMIN,    MVT::v16i16, 1},
5232     {ISD::UMIN,    MVT::v16i16, 1},
5233     {ISD::SMIN,    MVT::v32i8,  1},
5234     {ISD::UMIN,    MVT::v32i8,  1},
5235   };
5236 
5237   static const CostTblEntry AVX512CostTbl[] = {
5238     {ISD::FMINNUM, MVT::v16f32, 1},
5239     {ISD::FMINNUM, MVT::v8f64,  1},
5240     {ISD::SMIN,    MVT::v2i64,  1},
5241     {ISD::UMIN,    MVT::v2i64,  1},
5242     {ISD::SMIN,    MVT::v4i64,  1},
5243     {ISD::UMIN,    MVT::v4i64,  1},
5244     {ISD::SMIN,    MVT::v8i64,  1},
5245     {ISD::UMIN,    MVT::v8i64,  1},
5246     {ISD::SMIN,    MVT::v16i32, 1},
5247     {ISD::UMIN,    MVT::v16i32, 1},
5248   };
5249 
5250   static const CostTblEntry AVX512BWCostTbl[] = {
5251     {ISD::SMIN,    MVT::v32i16, 1},
5252     {ISD::UMIN,    MVT::v32i16, 1},
5253     {ISD::SMIN,    MVT::v64i8,  1},
5254     {ISD::UMIN,    MVT::v64i8,  1},
5255   };
5256 
5257   // If we have a native MIN/MAX instruction for this type, use it.
5258   if (ST->hasBWI())
5259     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5260       return LT.first * Entry->Cost;
5261 
5262   if (ST->hasAVX512())
5263     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
5264       return LT.first * Entry->Cost;
5265 
5266   if (ST->hasAVX2())
5267     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
5268       return LT.first * Entry->Cost;
5269 
5270   if (ST->hasAVX())
5271     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5272       return LT.first * Entry->Cost;
5273 
5274   if (ST->hasSSE42())
5275     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
5276       return LT.first * Entry->Cost;
5277 
5278   if (ST->hasSSE41())
5279     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5280       return LT.first * Entry->Cost;
5281 
5282   if (ST->hasSSE2())
5283     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5284       return LT.first * Entry->Cost;
5285 
5286   if (ST->hasSSE1())
5287     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
5288       return LT.first * Entry->Cost;
5289 
5290   unsigned CmpOpcode;
5291   if (Ty->isFPOrFPVectorTy()) {
5292     CmpOpcode = Instruction::FCmp;
5293   } else {
5294     assert(Ty->isIntOrIntVectorTy() &&
5295            "expecting floating point or integer type for min/max reduction");
5296     CmpOpcode = Instruction::ICmp;
5297   }
5298 
5299   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5300   // Otherwise fall back to cmp+select.
5301   InstructionCost Result =
5302       getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
5303                          CostKind) +
5304       getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
5305                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
5306   return Result;
5307 }
5308 
5309 InstructionCost
5310 X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
5311                                    bool IsUnsigned,
5312                                    TTI::TargetCostKind CostKind) {
5313   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5314 
5315   MVT MTy = LT.second;
5316 
5317   int ISD;
5318   if (ValTy->isIntOrIntVectorTy()) {
5319     ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
5320   } else {
5321     assert(ValTy->isFPOrFPVectorTy() &&
5322            "Expected float point or integer vector type.");
5323     ISD = ISD::FMINNUM;
5324   }
5325 
5326   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5327   // and make it as the cost.
5328 
5329   static const CostTblEntry SSE2CostTblNoPairWise[] = {
5330       {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5331       {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5332       {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5333   };
5334 
5335   static const CostTblEntry SSE41CostTblNoPairWise[] = {
5336       {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5337       {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5338       {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5339       {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5340       {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5341       {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5342       {ISD::SMIN, MVT::v2i8,  3}, // pminsb
5343       {ISD::SMIN, MVT::v4i8,  5}, // pminsb
5344       {ISD::SMIN, MVT::v8i8,  7}, // pminsb
5345       {ISD::SMIN, MVT::v16i8, 6},
5346       {ISD::UMIN, MVT::v2i8,  3}, // same as sse2
5347       {ISD::UMIN, MVT::v4i8,  5}, // same as sse2
5348       {ISD::UMIN, MVT::v8i8,  7}, // same as sse2
5349       {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5350   };
5351 
5352   static const CostTblEntry AVX1CostTblNoPairWise[] = {
5353       {ISD::SMIN, MVT::v16i16, 6},
5354       {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5355       {ISD::SMIN, MVT::v32i8, 8},
5356       {ISD::UMIN, MVT::v32i8, 8},
5357   };
5358 
5359   static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
5360       {ISD::SMIN, MVT::v32i16, 8},
5361       {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5362       {ISD::SMIN, MVT::v64i8, 10},
5363       {ISD::UMIN, MVT::v64i8, 10},
5364   };
5365 
5366   // Before legalizing the type, give a chance to look up illegal narrow types
5367   // in the table.
5368   // FIXME: Is there a better way to do this?
5369   EVT VT = TLI->getValueType(DL, ValTy);
5370   if (VT.isSimple()) {
5371     MVT MTy = VT.getSimpleVT();
5372     if (ST->hasBWI())
5373       if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
5374         return Entry->Cost;
5375 
5376     if (ST->hasAVX())
5377       if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
5378         return Entry->Cost;
5379 
5380     if (ST->hasSSE41())
5381       if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
5382         return Entry->Cost;
5383 
5384     if (ST->hasSSE2())
5385       if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
5386         return Entry->Cost;
5387   }
5388 
5389   auto *ValVTy = cast<FixedVectorType>(ValTy);
5390   unsigned NumVecElts = ValVTy->getNumElements();
5391 
5392   auto *Ty = ValVTy;
5393   InstructionCost MinMaxCost = 0;
5394   if (LT.first != 1 && MTy.isVector() &&
5395       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5396     // Type needs to be split. We need LT.first - 1 operations ops.
5397     Ty = FixedVectorType::get(ValVTy->getElementType(),
5398                               MTy.getVectorNumElements());
5399     auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
5400                                            MTy.getVectorNumElements());
5401     MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
5402     MinMaxCost *= LT.first - 1;
5403     NumVecElts = MTy.getVectorNumElements();
5404   }
5405 
5406   if (ST->hasBWI())
5407     if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
5408       return MinMaxCost + Entry->Cost;
5409 
5410   if (ST->hasAVX())
5411     if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
5412       return MinMaxCost + Entry->Cost;
5413 
5414   if (ST->hasSSE41())
5415     if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
5416       return MinMaxCost + Entry->Cost;
5417 
5418   if (ST->hasSSE2())
5419     if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
5420       return MinMaxCost + Entry->Cost;
5421 
5422   unsigned ScalarSize = ValTy->getScalarSizeInBits();
5423 
5424   // Special case power of 2 reductions where the scalar type isn't changed
5425   // by type legalization.
5426   if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5427       ScalarSize != MTy.getScalarSizeInBits())
5428     return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
5429 
5430   // Now handle reduction with the legal type, taking into account size changes
5431   // at each level.
5432   while (NumVecElts > 1) {
5433     // Determine the size of the remaining vector we need to reduce.
5434     unsigned Size = NumVecElts * ScalarSize;
5435     NumVecElts /= 2;
5436     // If we're reducing from 256/512 bits, use an extract_subvector.
5437     if (Size > 128) {
5438       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5439       MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5440                                    CostKind, NumVecElts, SubTy);
5441       Ty = SubTy;
5442     } else if (Size == 128) {
5443       // Reducing from 128 bits is a permute of v2f64/v2i64.
5444       VectorType *ShufTy;
5445       if (ValTy->isFloatingPointTy())
5446         ShufTy =
5447             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5448       else
5449         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5450       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5451                                    std::nullopt, CostKind, 0, nullptr);
5452     } else if (Size == 64) {
5453       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5454       FixedVectorType *ShufTy;
5455       if (ValTy->isFloatingPointTy())
5456         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5457       else
5458         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5459       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5460                                    std::nullopt, CostKind, 0, nullptr);
5461     } else {
5462       // Reducing from smaller size is a shift by immediate.
5463       auto *ShiftTy = FixedVectorType::get(
5464           Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5465       MinMaxCost += getArithmeticInstrCost(
5466           Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5467           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5468           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5469     }
5470 
5471     // Add the arithmetic op for this level.
5472     auto *SubCondTy =
5473         FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
5474     MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
5475   }
5476 
5477   // Add the final extract element to the cost.
5478   return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5479                                          CostKind, 0, nullptr, nullptr);
5480 }
5481 
5482 /// Calculate the cost of materializing a 64-bit value. This helper
5483 /// method might only calculate a fraction of a larger immediate. Therefore it
5484 /// is valid to return a cost of ZERO.
5485 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5486   if (Val == 0)
5487     return TTI::TCC_Free;
5488 
5489   if (isInt<32>(Val))
5490     return TTI::TCC_Basic;
5491 
5492   return 2 * TTI::TCC_Basic;
5493 }
5494 
5495 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5496                                           TTI::TargetCostKind CostKind) {
5497   assert(Ty->isIntegerTy());
5498 
5499   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5500   if (BitSize == 0)
5501     return ~0U;
5502 
5503   // Never hoist constants larger than 128bit, because this might lead to
5504   // incorrect code generation or assertions in codegen.
5505   // Fixme: Create a cost model for types larger than i128 once the codegen
5506   // issues have been fixed.
5507   if (BitSize > 128)
5508     return TTI::TCC_Free;
5509 
5510   if (Imm == 0)
5511     return TTI::TCC_Free;
5512 
5513   // Sign-extend all constants to a multiple of 64-bit.
5514   APInt ImmVal = Imm;
5515   if (BitSize % 64 != 0)
5516     ImmVal = Imm.sext(alignTo(BitSize, 64));
5517 
5518   // Split the constant into 64-bit chunks and calculate the cost for each
5519   // chunk.
5520   InstructionCost Cost = 0;
5521   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5522     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5523     int64_t Val = Tmp.getSExtValue();
5524     Cost += getIntImmCost(Val);
5525   }
5526   // We need at least one instruction to materialize the constant.
5527   return std::max<InstructionCost>(1, Cost);
5528 }
5529 
5530 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5531                                               const APInt &Imm, Type *Ty,
5532                                               TTI::TargetCostKind CostKind,
5533                                               Instruction *Inst) {
5534   assert(Ty->isIntegerTy());
5535 
5536   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5537   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5538   // here, so that constant hoisting will ignore this constant.
5539   if (BitSize == 0)
5540     return TTI::TCC_Free;
5541 
5542   unsigned ImmIdx = ~0U;
5543   switch (Opcode) {
5544   default:
5545     return TTI::TCC_Free;
5546   case Instruction::GetElementPtr:
5547     // Always hoist the base address of a GetElementPtr. This prevents the
5548     // creation of new constants for every base constant that gets constant
5549     // folded with the offset.
5550     if (Idx == 0)
5551       return 2 * TTI::TCC_Basic;
5552     return TTI::TCC_Free;
5553   case Instruction::Store:
5554     ImmIdx = 0;
5555     break;
5556   case Instruction::ICmp:
5557     // This is an imperfect hack to prevent constant hoisting of
5558     // compares that might be trying to check if a 64-bit value fits in
5559     // 32-bits. The backend can optimize these cases using a right shift by 32.
5560     // Ideally we would check the compare predicate here. There also other
5561     // similar immediates the backend can use shifts for.
5562     if (Idx == 1 && Imm.getBitWidth() == 64) {
5563       uint64_t ImmVal = Imm.getZExtValue();
5564       if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5565         return TTI::TCC_Free;
5566     }
5567     ImmIdx = 1;
5568     break;
5569   case Instruction::And:
5570     // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5571     // by using a 32-bit operation with implicit zero extension. Detect such
5572     // immediates here as the normal path expects bit 31 to be sign extended.
5573     if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5574       return TTI::TCC_Free;
5575     ImmIdx = 1;
5576     break;
5577   case Instruction::Add:
5578   case Instruction::Sub:
5579     // For add/sub, we can use the opposite instruction for INT32_MIN.
5580     if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5581       return TTI::TCC_Free;
5582     ImmIdx = 1;
5583     break;
5584   case Instruction::UDiv:
5585   case Instruction::SDiv:
5586   case Instruction::URem:
5587   case Instruction::SRem:
5588     // Division by constant is typically expanded later into a different
5589     // instruction sequence. This completely changes the constants.
5590     // Report them as "free" to stop ConstantHoist from marking them as opaque.
5591     return TTI::TCC_Free;
5592   case Instruction::Mul:
5593   case Instruction::Or:
5594   case Instruction::Xor:
5595     ImmIdx = 1;
5596     break;
5597   // Always return TCC_Free for the shift value of a shift instruction.
5598   case Instruction::Shl:
5599   case Instruction::LShr:
5600   case Instruction::AShr:
5601     if (Idx == 1)
5602       return TTI::TCC_Free;
5603     break;
5604   case Instruction::Trunc:
5605   case Instruction::ZExt:
5606   case Instruction::SExt:
5607   case Instruction::IntToPtr:
5608   case Instruction::PtrToInt:
5609   case Instruction::BitCast:
5610   case Instruction::PHI:
5611   case Instruction::Call:
5612   case Instruction::Select:
5613   case Instruction::Ret:
5614   case Instruction::Load:
5615     break;
5616   }
5617 
5618   if (Idx == ImmIdx) {
5619     int NumConstants = divideCeil(BitSize, 64);
5620     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5621     return (Cost <= NumConstants * TTI::TCC_Basic)
5622                ? static_cast<int>(TTI::TCC_Free)
5623                : Cost;
5624   }
5625 
5626   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5627 }
5628 
5629 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5630                                                 const APInt &Imm, Type *Ty,
5631                                                 TTI::TargetCostKind CostKind) {
5632   assert(Ty->isIntegerTy());
5633 
5634   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5635   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5636   // here, so that constant hoisting will ignore this constant.
5637   if (BitSize == 0)
5638     return TTI::TCC_Free;
5639 
5640   switch (IID) {
5641   default:
5642     return TTI::TCC_Free;
5643   case Intrinsic::sadd_with_overflow:
5644   case Intrinsic::uadd_with_overflow:
5645   case Intrinsic::ssub_with_overflow:
5646   case Intrinsic::usub_with_overflow:
5647   case Intrinsic::smul_with_overflow:
5648   case Intrinsic::umul_with_overflow:
5649     if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5650       return TTI::TCC_Free;
5651     break;
5652   case Intrinsic::experimental_stackmap:
5653     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5654       return TTI::TCC_Free;
5655     break;
5656   case Intrinsic::experimental_patchpoint_void:
5657   case Intrinsic::experimental_patchpoint_i64:
5658     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5659       return TTI::TCC_Free;
5660     break;
5661   }
5662   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5663 }
5664 
5665 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5666                                            TTI::TargetCostKind CostKind,
5667                                            const Instruction *I) {
5668   if (CostKind != TTI::TCK_RecipThroughput)
5669     return Opcode == Instruction::PHI ? 0 : 1;
5670   // Branches are assumed to be predicted.
5671   return 0;
5672 }
5673 
5674 int X86TTIImpl::getGatherOverhead() const {
5675   // Some CPUs have more overhead for gather. The specified overhead is relative
5676   // to the Load operation. "2" is the number provided by Intel architects. This
5677   // parameter is used for cost estimation of Gather Op and comparison with
5678   // other alternatives.
5679   // TODO: Remove the explicit hasAVX512()?, That would mean we would only
5680   // enable gather with a -march.
5681   if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5682     return 2;
5683 
5684   return 1024;
5685 }
5686 
5687 int X86TTIImpl::getScatterOverhead() const {
5688   if (ST->hasAVX512())
5689     return 2;
5690 
5691   return 1024;
5692 }
5693 
5694 // Return an average cost of Gather / Scatter instruction, maybe improved later.
5695 // FIXME: Add TargetCostKind support.
5696 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
5697                                             const Value *Ptr, Align Alignment,
5698                                             unsigned AddressSpace) {
5699 
5700   assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5701   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5702 
5703   // Try to reduce index size from 64 bit (default for GEP)
5704   // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5705   // operation will use 16 x 64 indices which do not fit in a zmm and needs
5706   // to split. Also check that the base pointer is the same for all lanes,
5707   // and that there's at most one variable index.
5708   auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5709     unsigned IndexSize = DL.getPointerSizeInBits();
5710     const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5711     if (IndexSize < 64 || !GEP)
5712       return IndexSize;
5713 
5714     unsigned NumOfVarIndices = 0;
5715     const Value *Ptrs = GEP->getPointerOperand();
5716     if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5717       return IndexSize;
5718     for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
5719       if (isa<Constant>(GEP->getOperand(i)))
5720         continue;
5721       Type *IndxTy = GEP->getOperand(i)->getType();
5722       if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5723         IndxTy = IndexVTy->getElementType();
5724       if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5725           !isa<SExtInst>(GEP->getOperand(i))) ||
5726          ++NumOfVarIndices > 1)
5727         return IndexSize; // 64
5728     }
5729     return (unsigned)32;
5730   };
5731 
5732   // Trying to reduce IndexSize to 32 bits for vector 16.
5733   // By default the IndexSize is equal to pointer size.
5734   unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5735                            ? getIndexSizeInBits(Ptr, DL)
5736                            : DL.getPointerSizeInBits();
5737 
5738   auto *IndexVTy = FixedVectorType::get(
5739       IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5740   std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5741   std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5742   InstructionCost::CostType SplitFactor =
5743       *std::max(IdxsLT.first, SrcLT.first).getValue();
5744   if (SplitFactor > 1) {
5745     // Handle splitting of vector of pointers
5746     auto *SplitSrcTy =
5747         FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5748     return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
5749                                          AddressSpace);
5750   }
5751 
5752   // The gather / scatter cost is given by Intel architects. It is a rough
5753   // number since we are looking at one instruction in a time.
5754   const int GSOverhead = (Opcode == Instruction::Load)
5755                              ? getGatherOverhead()
5756                              : getScatterOverhead();
5757   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5758                                            MaybeAlign(Alignment), AddressSpace,
5759                                            TTI::TCK_RecipThroughput);
5760 }
5761 
5762 /// Return the cost of full scalarization of gather / scatter operation.
5763 ///
5764 /// Opcode - Load or Store instruction.
5765 /// SrcVTy - The type of the data vector that should be gathered or scattered.
5766 /// VariableMask - The mask is non-constant at compile time.
5767 /// Alignment - Alignment for one element.
5768 /// AddressSpace - pointer[s] address space.
5769 ///
5770 /// FIXME: Add TargetCostKind support.
5771 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
5772                                             bool VariableMask, Align Alignment,
5773                                             unsigned AddressSpace) {
5774   Type *ScalarTy = SrcVTy->getScalarType();
5775   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5776   APInt DemandedElts = APInt::getAllOnes(VF);
5777   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5778 
5779   InstructionCost MaskUnpackCost = 0;
5780   if (VariableMask) {
5781     auto *MaskTy =
5782         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5783     MaskUnpackCost = getScalarizationOverhead(
5784         MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5785     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5786         Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5787         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5788     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5789     MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5790   }
5791 
5792   InstructionCost AddressUnpackCost = getScalarizationOverhead(
5793       FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
5794       /*Insert=*/false, /*Extract=*/true, CostKind);
5795 
5796   // The cost of the scalar loads/stores.
5797   InstructionCost MemoryOpCost =
5798       VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5799                            AddressSpace, CostKind);
5800 
5801   // The cost of forming the vector from loaded scalars/
5802   // scalarizing the vector to perform scalar stores.
5803   InstructionCost InsertExtractCost = getScalarizationOverhead(
5804       cast<FixedVectorType>(SrcVTy), DemandedElts,
5805       /*Insert=*/Opcode == Instruction::Load,
5806       /*Extract=*/Opcode == Instruction::Store, CostKind);
5807 
5808   return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5809 }
5810 
5811 /// Calculate the cost of Gather / Scatter operation
5812 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5813     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5814     Align Alignment, TTI::TargetCostKind CostKind,
5815     const Instruction *I = nullptr) {
5816   if (CostKind != TTI::TCK_RecipThroughput) {
5817     if ((Opcode == Instruction::Load &&
5818          isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5819          !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5820                                      Align(Alignment))) ||
5821         (Opcode == Instruction::Store &&
5822          isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5823          !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5824                                       Align(Alignment))))
5825       return 1;
5826     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5827                                          Alignment, CostKind, I);
5828   }
5829 
5830   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5831   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5832   if (!PtrTy && Ptr->getType()->isVectorTy())
5833     PtrTy = dyn_cast<PointerType>(
5834         cast<VectorType>(Ptr->getType())->getElementType());
5835   assert(PtrTy && "Unexpected type for Ptr argument");
5836   unsigned AddressSpace = PtrTy->getAddressSpace();
5837 
5838   if ((Opcode == Instruction::Load &&
5839        (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5840         forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5841                                    Align(Alignment)))) ||
5842       (Opcode == Instruction::Store &&
5843        (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5844         forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5845                                     Align(Alignment)))))
5846     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
5847                            AddressSpace);
5848 
5849   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
5850 }
5851 
5852 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5853                                const TargetTransformInfo::LSRCost &C2) {
5854     // X86 specific here are "instruction number 1st priority".
5855     return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5856                     C1.NumIVMuls, C1.NumBaseAdds,
5857                     C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5858            std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5859                     C2.NumIVMuls, C2.NumBaseAdds,
5860                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5861 }
5862 
5863 bool X86TTIImpl::canMacroFuseCmp() {
5864   return ST->hasMacroFusion() || ST->hasBranchFusion();
5865 }
5866 
5867 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5868   if (!ST->hasAVX())
5869     return false;
5870 
5871   // The backend can't handle a single element vector.
5872   if (isa<VectorType>(DataTy) &&
5873       cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5874     return false;
5875   Type *ScalarTy = DataTy->getScalarType();
5876 
5877   if (ScalarTy->isPointerTy())
5878     return true;
5879 
5880   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5881     return true;
5882 
5883   if (ScalarTy->isHalfTy() && ST->hasBWI())
5884     return true;
5885 
5886   if (!ScalarTy->isIntegerTy())
5887     return false;
5888 
5889   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5890   return IntWidth == 32 || IntWidth == 64 ||
5891          ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5892 }
5893 
5894 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5895   return isLegalMaskedLoad(DataType, Alignment);
5896 }
5897 
5898 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5899   unsigned DataSize = DL.getTypeStoreSize(DataType);
5900   // The only supported nontemporal loads are for aligned vectors of 16 or 32
5901   // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
5902   // (the equivalent stores only require AVX).
5903   if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5904     return DataSize == 16 ?  ST->hasSSE1() : ST->hasAVX2();
5905 
5906   return false;
5907 }
5908 
5909 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5910   unsigned DataSize = DL.getTypeStoreSize(DataType);
5911 
5912   // SSE4A supports nontemporal stores of float and double at arbitrary
5913   // alignment.
5914   if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5915     return true;
5916 
5917   // Besides the SSE4A subtarget exception above, only aligned stores are
5918   // available nontemporaly on any other subtarget.  And only stores with a size
5919   // of 4..32 bytes (powers of 2, only) are permitted.
5920   if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5921       !isPowerOf2_32(DataSize))
5922     return false;
5923 
5924   // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5925   // loads require AVX2).
5926   if (DataSize == 32)
5927     return ST->hasAVX();
5928   if (DataSize == 16)
5929     return ST->hasSSE1();
5930   return true;
5931 }
5932 
5933 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5934                                       ElementCount NumElements) const {
5935   // movddup
5936   return ST->hasSSE3() && !NumElements.isScalable() &&
5937          NumElements.getFixedValue() == 2 &&
5938          ElementTy == Type::getDoubleTy(ElementTy->getContext());
5939 }
5940 
5941 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
5942   if (!isa<VectorType>(DataTy))
5943     return false;
5944 
5945   if (!ST->hasAVX512())
5946     return false;
5947 
5948   // The backend can't handle a single element vector.
5949   if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5950     return false;
5951 
5952   Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5953 
5954   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5955     return true;
5956 
5957   if (!ScalarTy->isIntegerTy())
5958     return false;
5959 
5960   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5961   return IntWidth == 32 || IntWidth == 64 ||
5962          ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5963 }
5964 
5965 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
5966   return isLegalMaskedExpandLoad(DataTy);
5967 }
5968 
5969 bool X86TTIImpl::supportsGather() const {
5970   // Some CPUs have better gather performance than others.
5971   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
5972   // enable gather with a -march.
5973   return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5974 }
5975 
5976 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
5977   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
5978   // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
5979   // it to 8 elements, but zeroing upper bits of the mask vector will add more
5980   // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
5981   // Check, maybe the gather/scatter instruction is better in the VariableMask
5982   // case.
5983   unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5984   return NumElts == 1 ||
5985          (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5986 }
5987 
5988 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
5989   if (!supportsGather())
5990     return false;
5991   Type *ScalarTy = DataTy->getScalarType();
5992   if (ScalarTy->isPointerTy())
5993     return true;
5994 
5995   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5996     return true;
5997 
5998   if (!ScalarTy->isIntegerTy())
5999     return false;
6000 
6001   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6002   return IntWidth == 32 || IntWidth == 64;
6003 }
6004 
6005 bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6006                                  unsigned Opcode1,
6007                                  const SmallBitVector &OpcodeMask) const {
6008   // ADDSUBPS  4xf32 SSE3
6009   // VADDSUBPS 4xf32 AVX
6010   // VADDSUBPS 8xf32 AVX2
6011   // ADDSUBPD  2xf64 SSE3
6012   // VADDSUBPD 2xf64 AVX
6013   // VADDSUBPD 4xf64 AVX2
6014 
6015   unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6016   assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6017   if (!isPowerOf2_32(NumElements))
6018     return false;
6019   // Check the opcode pattern. We apply the mask on the opcode arguments and
6020   // then check if it is what we expect.
6021   for (int Lane : seq<int>(0, NumElements)) {
6022     unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6023     // We expect FSub for even lanes and FAdd for odd lanes.
6024     if (Lane % 2 == 0 && Opc != Instruction::FSub)
6025       return false;
6026     if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6027       return false;
6028   }
6029   // Now check that the pattern is supported by the target ISA.
6030   Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6031   if (ElemTy->isFloatTy())
6032     return ST->hasSSE3() && NumElements % 4 == 0;
6033   if (ElemTy->isDoubleTy())
6034     return ST->hasSSE3() && NumElements % 2 == 0;
6035   return false;
6036 }
6037 
6038 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6039   // AVX2 doesn't support scatter
6040   if (!ST->hasAVX512())
6041     return false;
6042   return isLegalMaskedGather(DataType, Alignment);
6043 }
6044 
6045 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6046   EVT VT = TLI->getValueType(DL, DataType);
6047   return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6048 }
6049 
6050 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6051   // FDIV is always expensive, even if it has a very low uop count.
6052   // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6053   if (I->getOpcode() == Instruction::FDiv)
6054     return true;
6055 
6056   return BaseT::isExpensiveToSpeculativelyExecute(I);
6057 }
6058 
6059 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6060   return false;
6061 }
6062 
6063 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6064                                      const Function *Callee) const {
6065   const TargetMachine &TM = getTLI()->getTargetMachine();
6066 
6067   // Work this as a subsetting of subtarget features.
6068   const FeatureBitset &CallerBits =
6069       TM.getSubtargetImpl(*Caller)->getFeatureBits();
6070   const FeatureBitset &CalleeBits =
6071       TM.getSubtargetImpl(*Callee)->getFeatureBits();
6072 
6073   // Check whether features are the same (apart from the ignore list).
6074   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6075   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6076   if (RealCallerBits == RealCalleeBits)
6077     return true;
6078 
6079   // If the features are a subset, we need to additionally check for calls
6080   // that may become ABI-incompatible as a result of inlining.
6081   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6082     return false;
6083 
6084   for (const Instruction &I : instructions(Callee)) {
6085     if (const auto *CB = dyn_cast<CallBase>(&I)) {
6086       SmallVector<Type *, 8> Types;
6087       for (Value *Arg : CB->args())
6088         Types.push_back(Arg->getType());
6089       if (!CB->getType()->isVoidTy())
6090         Types.push_back(CB->getType());
6091 
6092       // Simple types are always ABI compatible.
6093       auto IsSimpleTy = [](Type *Ty) {
6094         return !Ty->isVectorTy() && !Ty->isAggregateType();
6095       };
6096       if (all_of(Types, IsSimpleTy))
6097         continue;
6098 
6099       if (Function *NestedCallee = CB->getCalledFunction()) {
6100         // Assume that intrinsics are always ABI compatible.
6101         if (NestedCallee->isIntrinsic())
6102           continue;
6103 
6104         // Do a precise compatibility check.
6105         if (!areTypesABICompatible(Caller, NestedCallee, Types))
6106           return false;
6107       } else {
6108         // We don't know the target features of the callee,
6109         // assume it is incompatible.
6110         return false;
6111       }
6112     }
6113   }
6114   return true;
6115 }
6116 
6117 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6118                                        const Function *Callee,
6119                                        const ArrayRef<Type *> &Types) const {
6120   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6121     return false;
6122 
6123   // If we get here, we know the target features match. If one function
6124   // considers 512-bit vectors legal and the other does not, consider them
6125   // incompatible.
6126   const TargetMachine &TM = getTLI()->getTargetMachine();
6127 
6128   if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6129       TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6130     return true;
6131 
6132   // Consider the arguments compatible if they aren't vectors or aggregates.
6133   // FIXME: Look at the size of vectors.
6134   // FIXME: Look at the element types of aggregates to see if there are vectors.
6135   return llvm::none_of(Types,
6136       [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6137 }
6138 
6139 X86TTIImpl::TTI::MemCmpExpansionOptions
6140 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6141   TTI::MemCmpExpansionOptions Options;
6142   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6143   Options.NumLoadsPerBlock = 2;
6144   // All GPR and vector loads can be unaligned.
6145   Options.AllowOverlappingLoads = true;
6146   if (IsZeroCmp) {
6147     // Only enable vector loads for equality comparison. Right now the vector
6148     // version is not as fast for three way compare (see #33329).
6149     const unsigned PreferredWidth = ST->getPreferVectorWidth();
6150     if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
6151     if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6152     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6153   }
6154   if (ST->is64Bit()) {
6155     Options.LoadSizes.push_back(8);
6156   }
6157   Options.LoadSizes.push_back(4);
6158   Options.LoadSizes.push_back(2);
6159   Options.LoadSizes.push_back(1);
6160   return Options;
6161 }
6162 
6163 bool X86TTIImpl::prefersVectorizedAddressing() const {
6164   return supportsGather();
6165 }
6166 
6167 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6168   return false;
6169 }
6170 
6171 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6172   // TODO: We expect this to be beneficial regardless of arch,
6173   // but there are currently some unexplained performance artifacts on Atom.
6174   // As a temporary solution, disable on Atom.
6175   return !(ST->isAtom());
6176 }
6177 
6178 // Estimate the cost of an interleaved load/store group (or a strided load)
6179 // on AVX-512. \p Indices contains the member indices for a strided load.
6180 // \p Factor is the interleaving factor.
6181 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
//
// The returned cost is the sum of:
//  - the mask cost (only when \p UseMaskForCond or \p UseMaskForGaps is set),
//  - the cost of the legalized memory operations, and
//  - the shuffle cost, taken from a lookup table keyed on (Factor, VT) when an
//    entry exists and computed from a generic shuffle-count formula otherwise.
6182 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6183     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6184     ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6185     TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6186   // VecTy for interleave memop is <VF*Factor x Elt>.
6187   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6188   // VecTy = <12 x i32>.
6189 
6190   // Calculate the number of memory operations (NumOfMemOps) required
6191   // to load/store the whole VecTy (rounded up to whole legal vectors).
6192   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6193   unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6194   unsigned LegalVTSize = LegalVT.getStoreSize();
6195   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6196 
6197   // Get the cost of one memory operation on a single legal-width vector;
6198   // a masked memory op is costed when either mask flag is set.
6198   auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6199                                              LegalVT.getVectorNumElements());
6200   InstructionCost MemOpCost;
6201   bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6202   if (UseMaskedMemOp)
6203     MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6204                                       AddressSpace, CostKind);
6205   else
6206     MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6207                                 AddressSpace, CostKind);
6208 
  // VF is the number of elements per group member; VT (<VF x Elt>) is the key
  // used for the cost-table lookups below.
6209   unsigned VF = VecTy->getNumElements() / Factor;
6210   MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6211 
6212   InstructionCost MaskCost;
6213   if (UseMaskedMemOp) {
    // Mark every lane of VecTy that belongs to one of the requested members.
6214     APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6215     for (unsigned Index : Indices) {
6216       assert(Index < Factor && "Invalid index for interleaved memory op");
6217       for (unsigned Elm = 0; Elm < VF; Elm++)
6218         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6219     }
6220 
6221     Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6222 
    // Cost of replicating the VF-wide i1 mask Factor times; with a gaps mask
    // only the demanded lanes are requested, otherwise all lanes are.
6223     MaskCost = getReplicationShuffleCost(
6224         I1Type, Factor, VF,
6225         UseMaskForGaps ? DemandedLoadStoreElts
6226                        : APInt::getAllOnes(VecTy->getNumElements()),
6227         CostKind);
6228 
6229     // The Gaps mask is invariant and created outside the loop, therefore the
6230     // cost of creating it is not accounted for here. However if we have both
6231     // a MaskForGaps and some other mask that guards the execution of the
6232     // memory access, we need to account for the cost of And-ing the two masks
6233     // inside the loop.
6234     if (UseMaskForGaps) {
6235       auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6236       MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6237     }
6238   }
6239 
6240   if (Opcode == Instruction::Load) {
6241     // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6242     // contain the cost of the optimized shuffle sequence that the
6243     // X86InterleavedAccess pass will generate.
6244     // The costs of the loads and stores are computed separately from the table.
6245 
6246     // X86InterleavedAccess supports only the following interleaved-access
6246     // groups.
6247     static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6248         {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
6249         {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6250         {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
6251     };
6252 
6253     if (const auto *Entry =
6254             CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6255       return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6256     // If no entry exists, fall back to the generic estimate below.
6257 
6258     // Kind of shuffle depends on number of loaded values.
6259     // If we load the entire data in one register, we can use a 1-src shuffle.
6260     // Otherwise, we'll merge 2 sources in each operation.
6261     TTI::ShuffleKind ShuffleKind =
6262         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6263 
6264     InstructionCost ShuffleCost = getShuffleCost(
6265         ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6266 
    // An empty Indices list is treated as "all Factor members are loaded".
6267     unsigned NumOfLoadsInInterleaveGrp =
6268         Indices.size() ? Indices.size() : Factor;
6269     auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6270                                           VecTy->getNumElements() / Factor);
    // Number of legal-width result vectors produced across all loaded members.
6271     InstructionCost NumOfResults =
6272         getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6273 
6274     // About a half of the loads may be folded in shuffles when we have only
6275     // one result. If we have more than one result, or the loads are masked,
6276     // we do not fold loads at all.
6277     unsigned NumOfUnfoldedLoads =
6278         UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6279 
6280     // Get a number of shuffle operations per result.
6281     unsigned NumOfShufflesPerResult =
6282         std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6283 
6284     // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
6285     // When we have more than one destination, we need additional instructions
6286     // to keep sources.
6287     InstructionCost NumOfMoves = 0;
6288     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6289       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6290 
6291     InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6292                            MaskCost + NumOfUnfoldedLoads * MemOpCost +
6293                            NumOfMoves;
6294 
6295     return Cost;
6296   }
6297 
6298   // Store.
6299   assert(Opcode == Instruction::Store &&
6300          "Expected Store Instruction at this  point");
6301   // X86InterleavedAccess supports only the following interleaved-access
6301   // groups.
6302   static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6303       {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6304       {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6305       {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6306 
6307       {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
6308       {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
6309       {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6310       {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
6311   };
6312 
6313   if (const auto *Entry =
6314           CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6315     return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6316   // If no entry exists, fall back to the generic estimate below.
6317 
6318   // There are no strided stores at the moment, and a store can't be folded
6319   // into a shuffle.
6320   unsigned NumOfSources = Factor; // The number of values to be merged.
6321   InstructionCost ShuffleCost = getShuffleCost(
6322       TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6323   unsigned NumOfShufflesPerStore = NumOfSources - 1;
6324 
6325   // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
6326   // We need additional instructions to keep sources.
6327   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6328   InstructionCost Cost =
6329       MaskCost +
6330       NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6331       NumOfMoves;
6332   return Cost;
6333 }
6334 
/// Estimate the cost of an interleaved load/store group on X86.
///
/// \p BaseTy must be a FixedVectorType of the form <VF*Factor x Elt>.
/// The query is resolved in this order:
///  1. With AVX-512 and a qualifying element type (8/16-bit integer and half
///     elements additionally require BWI), defer to
///     getInterleavedMemoryOpCostAVX512().
///  2. If either mask flag is set, defer to the target-independent
///     implementation.
///  3. Otherwise look up the shuffle cost in the AVX2/SSSE3/SSE2 tables keyed
///     on (Factor, <VF x iN>) and add the cost of the memory operations.
///  4. If nothing matches, defer to the target-independent implementation.
6335 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6336     unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6337     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6338     bool UseMaskForCond, bool UseMaskForGaps) {
6339   auto *VecTy = cast<FixedVectorType>(BaseTy);
6340 
  // Element types handled by the AVX-512 path: 32/64-bit integers, float,
  // double and pointers always; 8/16-bit integers and half only with BWI.
  // Note that the lambda parameter shadows the outer VecTy.
6341   auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
6342     Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6343     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6344         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6345       return true;
6346     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6347       return HasBW;
6348     return false;
6349   };
6350   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
6351     return getInterleavedMemoryOpCostAVX512(
6352         Opcode, VecTy, Factor, Indices, Alignment,
6353         AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6354 
  // The tables below only model unmasked groups.
6355   if (UseMaskForCond || UseMaskForGaps)
6356     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6357                                              Alignment, AddressSpace, CostKind,
6358                                              UseMaskForCond, UseMaskForGaps);
6359 
6360   // Get estimation for interleaved load/store operations for SSE-AVX2.
6361   // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6362   // computing the cost using a generic formula as a function of generic
6363   // shuffles. We therefore use a lookup table instead, filled according to
6364   // the instruction sequences that codegen currently generates.
6365 
6366   // VecTy for interleave memop is <VF*Factor x Elt>.
6367   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6368   // VecTy = <12 x i32>.
6369   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6370 
6371   // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6372   // the VF=2, while v2i128 is an unsupported MVT vector type
6373   // (see MachineValueType.h::getVectorVT()).
  // Both mask flags are known to be false here (see the early return above),
  // so the trailing mask arguments are left to BaseT's defaults (presumably
  // false; confirm against the BaseT declaration).
6374   if (!LegalVT.isVector())
6375     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6376                                              Alignment, AddressSpace, CostKind);
6377 
6378   unsigned VF = VecTy->getNumElements() / Factor;
6379   Type *ScalarTy = VecTy->getElementType();
6380   // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6381   if (!ScalarTy->isIntegerTy())
6382     ScalarTy =
6383         Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6384 
6385   // Get the cost of all the memory operations.
6386   // FIXME: discount dead loads.
6387   InstructionCost MemOpCosts = getMemoryOpCost(
6388       Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6389 
  // VT is the per-member type <VF x iN>; the tables can only be queried with
  // a simple value type, so anything else goes to the generic implementation.
6390   auto *VT = FixedVectorType::get(ScalarTy, VF);
6391   EVT ETy = TLI->getValueType(DL, VT);
6392   if (!ETy.isSimple())
6393     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6394                                              Alignment, AddressSpace, CostKind);
6395 
6396   // TODO: Complete for other data-types and strides.
6397   // Each combination of Stride, element bit width and VF results in a different
6398   // sequence; The cost tables are therefore accessed with:
6399   // Factor (stride) and VectorType=VFxiN.
6400   // The Cost accounts only for the shuffle sequence;
6401   // The cost of the loads/stores is accounted for separately.
6402   //
6403   static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6404       {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
6405       {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
6406       {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
6407       {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6408       {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6409 
6410       {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
6411       {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
6412       {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6413 
6414       {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
6415       {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
6416       {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6417 
6418       {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
6419       {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
6420       {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6421       {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6422 
6423       {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
6424       {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
6425       {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
6426       {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6427       {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6428 
6429       {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
6430       {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
6431       {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
6432       {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6433       {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6434 
6435       {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
6436       {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
6437       {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
6438       {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6439       {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6440 
6441       {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
6442       {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
6443       {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
6444       {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6445 
6446       {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
6447       {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
6448       {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
6449       {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6450       {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6451 
6452       {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
6453       {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
6454       {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
6455       {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
6456       {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6457 
6458       {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
6459       {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
6460       {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
6461       {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6462       {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6463 
6464       {4, MVT::v2i64, 6},  // (load 8i64 and) deinterleave into 4 x 2i64
6465       {4, MVT::v4i64, 8},  // (load 16i64 and) deinterleave into 4 x 4i64
6466       {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6467       {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6468 
6469       {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
6470       {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
6471       {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
6472       {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6473       {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6474 
6475       {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
6476       {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
6477       {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
6478       {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6479       {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6480 
6481       {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
6482       {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
6483       {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
6484       {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6485 
6486       {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
6487       {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6488       {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6489 
6490       {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6491   };
6492 
6493   static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6494       {2, MVT::v4i16, 2},   // (load 8i16 and) deinterleave into 2 x 4i16
6495   };
6496 
6497   static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6498       {2, MVT::v2i16, 2},   // (load 4i16 and) deinterleave into 2 x 2i16
6499       {2, MVT::v4i16, 7},   // (load 8i16 and) deinterleave into 2 x 4i16
6500 
6501       {2, MVT::v2i32, 2},   // (load 4i32 and) deinterleave into 2 x 2i32
6502       {2, MVT::v4i32, 2},   // (load 8i32 and) deinterleave into 2 x 4i32
6503 
6504       {2, MVT::v2i64, 2},   // (load 4i64 and) deinterleave into 2 x 2i64
6505   };
6506 
6507   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6508       {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6509       {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6510 
6511       {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
6512       {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6513       {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6514 
6515       {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
6516       {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
6517       {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
6518       {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6519 
6520       {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
6521       {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
6522       {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
6523       {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6524       {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6525 
6526       {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
6527       {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
6528       {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
6529       {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6530       {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6531 
6532       {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
6533       {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
6534       {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
6535       {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6536       {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6537 
6538       {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
6539       {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
6540       {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
6541       {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6542       {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6543 
6544       {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
6545       {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
6546       {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
6547       {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6548 
6549       {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
6550       {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
6551       {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
6552       {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
6553       {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6554 
6555       {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
6556       {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
6557       {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
6558       {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6559       {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6560 
6561       {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
6562       {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
6563       {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
6564       {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6565       {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6566 
6567       {4, MVT::v2i64, 6},  // interleave 4 x 2i64 into 8i64 (and store)
6568       {4, MVT::v4i64, 8},  // interleave 4 x 4i64 into 16i64 (and store)
6569       {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6570       {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6571 
6572       {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
6573       {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
6574       {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
6575       {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6576       {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6577 
6578       {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
6579       {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
6580       {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
6581       {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6582       {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6583 
6584       {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
6585       {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
6586       {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
6587       {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6588 
6589       {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
6590       {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6591       {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6592   };
6593 
6594   static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6595       {2, MVT::v2i8, 1},   // interleave 2 x 2i8 into 4i8 (and store)
6596       {2, MVT::v4i8, 1},   // interleave 2 x 4i8 into 8i8 (and store)
6597       {2, MVT::v8i8, 1},   // interleave 2 x 8i8 into 16i8 (and store)
6598 
6599       {2, MVT::v2i16, 1},  // interleave 2 x 2i16 into 4i16 (and store)
6600       {2, MVT::v4i16, 1},  // interleave 2 x 4i16 into 8i16 (and store)
6601 
6602       {2, MVT::v2i32, 1},  // interleave 2 x 2i32 into 4i32 (and store)
6603   };
6604 
  // Tables are consulted from the most capable feature level down; the first
  // matching entry wins.
6605   if (Opcode == Instruction::Load) {
    // Scale the full-group shuffle cost by the fraction of members actually
    // loaded (Indices.size() / Factor, rounded up) and add the memory cost.
    // NOTE(review): an empty Indices makes the shuffle term zero here, while
    // getInterleavedMemoryOpCostAVX512() treats empty Indices as all Factor
    // members; confirm that callers always pass Indices for loads.
6606     auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6607                               MemOpCosts](const CostTblEntry *Entry) {
6608       // NOTE: this is just an approximation!
6609       //       It can over/under -estimate the cost!
6610       return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6611     };
6612 
6613     if (ST->hasAVX2())
6614       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6615                                               ETy.getSimpleVT()))
6616         return GetDiscountedCost(Entry);
6617 
6618     if (ST->hasSSSE3())
6619       if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6620                                               ETy.getSimpleVT()))
6621         return GetDiscountedCost(Entry);
6622 
6623     if (ST->hasSSE2())
6624       if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6625                                               ETy.getSimpleVT()))
6626         return GetDiscountedCost(Entry);
6627   } else {
6628     assert(Opcode == Instruction::Store &&
6629            "Expected Store Instruction at this point");
6630     assert((!Indices.size() || Indices.size() == Factor) &&
6631            "Interleaved store only supports fully-interleaved groups.");
    // Stores are always full groups, so the table cost is used undiscounted.
6632     if (ST->hasAVX2())
6633       if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6634                                               ETy.getSimpleVT()))
6635         return MemOpCosts + Entry->Cost;
6636 
6637     if (ST->hasSSE2())
6638       if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6639                                               ETy.getSimpleVT()))
6640         return MemOpCosts + Entry->Cost;
6641   }
6642 
  // No table entry matched: use the target-independent estimate.
6643   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6644                                            Alignment, AddressSpace, CostKind,
6645                                            UseMaskForCond, UseMaskForGaps);
6646 }
6647 
6648 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6649                                                  int64_t BaseOffset,
6650                                                  bool HasBaseReg, int64_t Scale,
6651                                                  unsigned AddrSpace) const {
6652   // Scaling factors are not free at all.
6653   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6654   // will take 2 allocations in the out of order engine instead of 1
6655   // for plain addressing mode, i.e. inst (reg1).
6656   // E.g.,
6657   // vaddps (%rsi,%rdx), %ymm0, %ymm1
6658   // Requires two allocations (one for the load, one for the computation)
6659   // whereas:
6660   // vaddps (%rsi), %ymm0, %ymm1
6661   // Requires just 1 allocation, i.e., freeing allocations for other operations
6662   // and having less micro operations to execute.
6663   //
6664   // For some X86 architectures, this is even worse because for instance for
6665   // stores, the complex addressing mode forces the instruction to use the
6666   // "load" ports instead of the dedicated "store" port.
6667   // E.g., on Haswell:
6668   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6669   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6670   TargetLoweringBase::AddrMode AM;
6671   AM.BaseGV = BaseGV;
6672   AM.BaseOffs = BaseOffset;
6673   AM.HasBaseReg = HasBaseReg;
6674   AM.Scale = Scale;
6675   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6676     // Scale represents reg2 * scale, thus account for 1
6677     // as soon as we use a second register.
6678     return AM.Scale != 0;
6679   return -1;
6680 }
6681