xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp (revision a4e5e0106ac7145f56eb39a691e302cabb4635be)
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// About Cost Model numbers used below it's necessary to say the following:
16 /// the numbers correspond to some "generic" X86 CPU instead of usage of a
17 /// specific CPU model. Usually the numbers correspond to the CPU where the
18 /// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19 /// the lookups below the cost is based on Nehalem as that was the first CPU
20 /// to support that feature level and thus has most likely the worst case cost,
21 /// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22 ///
23 /// Some examples of other technologies/CPUs:
24 ///   SSE 3   - Pentium4 / Athlon64
25 ///   SSE 4.1 - Penryn
26 ///   SSE 4.2 - Nehalem / Silvermont
27 ///   AVX     - Sandy Bridge / Jaguar / Bulldozer
28 ///   AVX2    - Haswell / Ryzen
29 ///   AVX-512 - Xeon Phi / Skylake
30 ///
31 /// And some examples of instruction target dependent costs (latency)
32 ///                   divss     sqrtss          rsqrtss
33 ///   AMD K7          11-16     19              3
34 ///   Piledriver      9-24      13-15           5
35 ///   Jaguar          14        16              2
36 ///   Pentium II,III  18        30              2
37 ///   Nehalem         7-14      7-18            3
38 ///   Haswell         10-13     11              5
39 ///
40 /// Interpreting the 4 TargetCostKind types:
41 /// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42 /// values reported by the CPU scheduler models (and llvm-mca).
43 /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44 /// actual encoding size of the instruction.
45 /// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46 /// the CPU scheduler models (and llvm-mca), to ensure that they are
47 /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48 /// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49 //===----------------------------------------------------------------------===//
50 
51 #include "X86TargetTransformInfo.h"
52 #include "llvm/Analysis/TargetTransformInfo.h"
53 #include "llvm/CodeGen/BasicTTIImpl.h"
54 #include "llvm/CodeGen/CostTable.h"
55 #include "llvm/CodeGen/TargetLowering.h"
56 #include "llvm/IR/InstIterator.h"
57 #include "llvm/IR/IntrinsicInst.h"
58 #include "llvm/Support/Debug.h"
59 #include <optional>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "x86tti"
64 
65 //===----------------------------------------------------------------------===//
66 //
67 // X86 cost model.
68 //
69 //===----------------------------------------------------------------------===//
70 
71 // Helper struct to store/access costs for each cost kind.
72 // TODO: Move this to allow other targets to use it?
73 struct CostKindCosts {
74   unsigned RecipThroughputCost = ~0U;
75   unsigned LatencyCost = ~0U;
76   unsigned CodeSizeCost = ~0U;
77   unsigned SizeAndLatencyCost = ~0U;
78 
79   std::optional<unsigned>
80   operator[](TargetTransformInfo::TargetCostKind Kind) const {
81     unsigned Cost = ~0U;
82     switch (Kind) {
83     case TargetTransformInfo::TCK_RecipThroughput:
84       Cost = RecipThroughputCost;
85       break;
86     case TargetTransformInfo::TCK_Latency:
87       Cost = LatencyCost;
88       break;
89     case TargetTransformInfo::TCK_CodeSize:
90       Cost = CodeSizeCost;
91       break;
92     case TargetTransformInfo::TCK_SizeAndLatency:
93       Cost = SizeAndLatencyCost;
94       break;
95     }
96     if (Cost == ~0U)
97       return std::nullopt;
98     return Cost;
99   }
100 };
101 using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
102 
103 TargetTransformInfo::PopcntSupportKind
104 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106   // TODO: Currently the __builtin_popcount() implementation using SSE3
107   //   instructions is inefficient. Once the problem is fixed, we should
108   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
109   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110 }
111 
112 std::optional<unsigned> X86TTIImpl::getCacheSize(
113   TargetTransformInfo::CacheLevel Level) const {
114   switch (Level) {
115   case TargetTransformInfo::CacheLevel::L1D:
116     //   - Penryn
117     //   - Nehalem
118     //   - Westmere
119     //   - Sandy Bridge
120     //   - Ivy Bridge
121     //   - Haswell
122     //   - Broadwell
123     //   - Skylake
124     //   - Kabylake
125     return 32 * 1024;  //  32 KByte
126   case TargetTransformInfo::CacheLevel::L2D:
127     //   - Penryn
128     //   - Nehalem
129     //   - Westmere
130     //   - Sandy Bridge
131     //   - Ivy Bridge
132     //   - Haswell
133     //   - Broadwell
134     //   - Skylake
135     //   - Kabylake
136     return 256 * 1024; // 256 KByte
137   }
138 
139   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140 }
141 
142 std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143   TargetTransformInfo::CacheLevel Level) const {
144   //   - Penryn
145   //   - Nehalem
146   //   - Westmere
147   //   - Sandy Bridge
148   //   - Ivy Bridge
149   //   - Haswell
150   //   - Broadwell
151   //   - Skylake
152   //   - Kabylake
153   switch (Level) {
154   case TargetTransformInfo::CacheLevel::L1D:
155     [[fallthrough]];
156   case TargetTransformInfo::CacheLevel::L2D:
157     return 8;
158   }
159 
160   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161 }
162 
163 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164   bool Vector = (ClassID == 1);
165   if (Vector && !ST->hasSSE1())
166     return 0;
167 
168   if (ST->is64Bit()) {
169     if (Vector && ST->hasAVX512())
170       return 32;
171     return 16;
172   }
173   return 8;
174 }
175 
176 TypeSize
177 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179   switch (K) {
180   case TargetTransformInfo::RGK_Scalar:
181     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182   case TargetTransformInfo::RGK_FixedWidthVector:
183     if (ST->hasAVX512() && PreferVectorWidth >= 512)
184       return TypeSize::getFixed(512);
185     if (ST->hasAVX() && PreferVectorWidth >= 256)
186       return TypeSize::getFixed(256);
187     if (ST->hasSSE1() && PreferVectorWidth >= 128)
188       return TypeSize::getFixed(128);
189     return TypeSize::getFixed(0);
190   case TargetTransformInfo::RGK_ScalableVector:
191     return TypeSize::getScalable(0);
192   }
193 
194   llvm_unreachable("Unsupported register kind");
195 }
196 
197 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198   return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199       .getFixedValue();
200 }
201 
202 unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203   // If the loop will not be vectorized, don't interleave the loop.
204   // Let regular unroll to unroll the loop, which saves the overflow
205   // check and memory check cost.
206   if (VF.isScalar())
207     return 1;
208 
209   if (ST->isAtom())
210     return 1;
211 
212   // Sandybridge and Haswell have multiple execution ports and pipelined
213   // vector units.
214   if (ST->hasAVX())
215     return 4;
216 
217   return 2;
218 }
219 
220 InstructionCost X86TTIImpl::getArithmeticInstrCost(
221     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223     ArrayRef<const Value *> Args,
224     const Instruction *CxtI) {
225 
226   // vXi8 multiplications are always promoted to vXi16.
227   // Sub-128-bit types can be extended/packed more efficiently.
228   if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229       Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230     Type *WideVecTy =
231         VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232     return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233                             TargetTransformInfo::CastContextHint::None,
234                             CostKind) +
235            getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236                             TargetTransformInfo::CastContextHint::None,
237                             CostKind) +
238            getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239   }
240 
241   // Legalize the type.
242   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243 
244   int ISD = TLI->InstructionOpcodeToISD(Opcode);
245   assert(ISD && "Invalid opcode");
246 
247   if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248       (LT.second.getScalarType() == MVT::i32 ||
249        LT.second.getScalarType() == MVT::i64)) {
250     // Check if the operands can be represented as a smaller datatype.
251     bool Op1Signed = false, Op2Signed = false;
252     unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253     unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254     unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255     bool SignedMode = Op1Signed || Op2Signed;
256 
257     // If both vXi32 are representable as i15 and at least one is constant,
258     // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259     // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260     if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261         LT.second.getScalarType() == MVT::i32) {
262       bool Op1Constant =
263           isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264       bool Op2Constant =
265           isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266       bool Op1Sext = isa<SExtInst>(Args[0]) &&
267                      (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268       bool Op2Sext = isa<SExtInst>(Args[1]) &&
269                      (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270 
271       bool IsZeroExtended = !Op1Signed || !Op2Signed;
272       bool IsConstant = Op1Constant || Op2Constant;
273       bool IsSext = Op1Sext || Op2Sext;
274       if (IsConstant || IsZeroExtended || IsSext)
275         LT.second =
276             MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277     }
278 
279     // Check if the vXi32 operands can be shrunk into a smaller datatype.
280     // This should match the codegen from reduceVMULWidth.
281     // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282     if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283       if (OpMinSize <= 7)
284         return LT.first * 3; // pmullw/sext
285       if (!SignedMode && OpMinSize <= 8)
286         return LT.first * 3; // pmullw/zext
287       if (OpMinSize <= 15)
288         return LT.first * 5; // pmullw/pmulhw/pshuf
289       if (!SignedMode && OpMinSize <= 16)
290         return LT.first * 5; // pmullw/pmulhw/pshuf
291     }
292 
293     // If both vXi64 are representable as (unsigned) i32, then we can perform
294     // the multiple with a single PMULUDQ instruction.
295     // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296     if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297       ISD = X86ISD::PMULUDQ;
298   }
299 
300   // Vector multiply by pow2 will be simplified to shifts.
301   // Vector multiply by -pow2 will be simplified to shifts/negates.
302   if (ISD == ISD::MUL && Op2Info.isConstant() &&
303       (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304     InstructionCost Cost =
305         getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306                                Op1Info.getNoProps(), Op2Info.getNoProps());
307     if (Op2Info.isNegatedPowerOf2())
308       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309     return Cost;
310   }
311 
312   // On X86, vector signed division by constants power-of-two are
313   // normally expanded to the sequence SRA + SRL + ADD + SRA.
314   // The OperandValue properties may not be the same as that of the previous
315   // operation; conservatively assume OP_None.
316   if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317       Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318     InstructionCost Cost =
319         2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320                                    Op1Info.getNoProps(), Op2Info.getNoProps());
321     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322                                    Op1Info.getNoProps(), Op2Info.getNoProps());
323     Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324                                    Op1Info.getNoProps(), Op2Info.getNoProps());
325 
326     if (ISD == ISD::SREM) {
327       // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328       Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329                                      Op2Info.getNoProps());
330       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331                                      Op2Info.getNoProps());
332     }
333 
334     return Cost;
335   }
336 
337   // Vector unsigned division/remainder will be simplified to shifts/masks.
338   if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339       Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340     if (ISD == ISD::UDIV)
341       return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342                                     Op1Info.getNoProps(), Op2Info.getNoProps());
343     // UREM
344     return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345                                   Op1Info.getNoProps(), Op2Info.getNoProps());
346   }
347 
348   static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349     { ISD::SHL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
350     { ISD::SRL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
351     { ISD::SRA,  MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352     { ISD::SHL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
353     { ISD::SRL,  MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
354     { ISD::SRA,  MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355     { ISD::SHL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
356     { ISD::SRL,  MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
357     { ISD::SRA,  MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358 
359     { ISD::SHL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360     { ISD::SRL,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361     { ISD::SRA,  MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
362     { ISD::SHL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363     { ISD::SRL,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364     { ISD::SRA,  MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
365   };
366 
367   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368     if (const auto *Entry =
369             CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370       if (auto KindCost = Entry->Cost[CostKind])
371         return LT.first * *KindCost;
372 
373   static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374     { ISD::SHL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psllw + pand.
375     { ISD::SRL,  MVT::v64i8,  {  2, 12,  5,  6 } }, // psrlw + pand.
376     { ISD::SRA,  MVT::v64i8,  {  3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377 
378     { ISD::SHL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psllw + split.
379     { ISD::SRL,  MVT::v16i16, {  2,  7,  4,  4 } }, // psrlw + split.
380     { ISD::SRA,  MVT::v16i16, {  2,  7,  4,  4 } }, // psraw + split.
381 
382     { ISD::SHL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // pslld
383     { ISD::SRL,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrld
384     { ISD::SRA,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psrad
385     { ISD::SHL,  MVT::v16i32, {  1,  1,  1,  1 } }, // pslld
386     { ISD::SRL,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrld
387     { ISD::SRA,  MVT::v16i32, {  1,  1,  1,  1 } }, // psrad
388 
389     { ISD::SRA,  MVT::v2i64,  {  1,  1,  1,  1 } }, // psraq
390     { ISD::SHL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psllq
391     { ISD::SRL,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psrlq
392     { ISD::SRA,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psraq
393     { ISD::SHL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psllq
394     { ISD::SRL,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psrlq
395     { ISD::SRA,  MVT::v8i64,  {  1,  1,  1,  1 } }, // psraq
396 
397     { ISD::SDIV, MVT::v16i32, {  6 } }, // pmuludq sequence
398     { ISD::SREM, MVT::v16i32, {  8 } }, // pmuludq+mul+sub sequence
399     { ISD::UDIV, MVT::v16i32, {  5 } }, // pmuludq sequence
400     { ISD::UREM, MVT::v16i32, {  7 } }, // pmuludq+mul+sub sequence
401   };
402 
403   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404     if (const auto *Entry =
405             CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406       if (auto KindCost = Entry->Cost[CostKind])
407         return LT.first * *KindCost;
408 
409   static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410     { ISD::SHL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psllw + pand.
411     { ISD::SRL,  MVT::v16i8, {  1,  8,  2,  3 } }, // psrlw + pand.
412     { ISD::SRA,  MVT::v16i8, {  2, 10,  5,  6 } }, // psrlw, pand, pxor, psubb.
413     { ISD::SHL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psllw + pand.
414     { ISD::SRL,  MVT::v32i8, {  2,  8,  2,  4 } }, // psrlw + pand.
415     { ISD::SRA,  MVT::v32i8, {  3, 10,  5,  9 } }, // psrlw, pand, pxor, psubb.
416 
417     { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw
418     { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw
419     { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw
420     { ISD::SHL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psllw
421     { ISD::SRL,  MVT::v16i16,{  2,  2,  1,  2 } }, // psrlw
422     { ISD::SRA,  MVT::v16i16,{  2,  2,  1,  2 } }, // psraw
423 
424     { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
425     { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld
426     { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad
427     { ISD::SHL,  MVT::v8i32, {  2,  2,  1,  2 } }, // pslld
428     { ISD::SRL,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrld
429     { ISD::SRA,  MVT::v8i32, {  2,  2,  1,  2 } }, // psrad
430 
431     { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq
432     { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq
433     { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
434     { ISD::SHL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psllq
435     { ISD::SRL,  MVT::v4i64, {  2,  2,  1,  2 } }, // psrlq
436     { ISD::SRA,  MVT::v4i64, {  4,  4,  3,  6 } }, // psrad + shuffle + split.
437 
438     { ISD::SDIV, MVT::v8i32, {  6 } }, // pmuludq sequence
439     { ISD::SREM, MVT::v8i32, {  8 } }, // pmuludq+mul+sub sequence
440     { ISD::UDIV, MVT::v8i32, {  5 } }, // pmuludq sequence
441     { ISD::UREM, MVT::v8i32, {  7 } }, // pmuludq+mul+sub sequence
442   };
443 
444   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445     if (const auto *Entry =
446             CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447       if (auto KindCost = Entry->Cost[CostKind])
448         return LT.first * *KindCost;
449 
450   static const CostKindTblEntry AVXUniformConstCostTable[] = {
451     { ISD::SHL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psllw + pand.
452     { ISD::SRL,  MVT::v16i8, {  2,  7,  2,  3 } }, // psrlw + pand.
453     { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
454     { ISD::SHL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psllw + pand) + split.
455     { ISD::SRL,  MVT::v32i8, {  4,  7,  7,  8 } }, // 2*(psrlw + pand) + split.
456     { ISD::SRA,  MVT::v32i8, {  7,  7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457 
458     { ISD::SHL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psllw.
459     { ISD::SRL,  MVT::v8i16, {  1,  2,  1,  1 } }, // psrlw.
460     { ISD::SRA,  MVT::v8i16, {  1,  2,  1,  1 } }, // psraw.
461     { ISD::SHL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psllw + split.
462     { ISD::SRL,  MVT::v16i16,{  3,  6,  4,  5 } }, // psrlw + split.
463     { ISD::SRA,  MVT::v16i16,{  3,  6,  4,  5 } }, // psraw + split.
464 
465     { ISD::SHL,  MVT::v4i32, {  1,  2,  1,  1 } }, // pslld.
466     { ISD::SRL,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrld.
467     { ISD::SRA,  MVT::v4i32, {  1,  2,  1,  1 } }, // psrad.
468     { ISD::SHL,  MVT::v8i32, {  3,  6,  4,  5 } }, // pslld + split.
469     { ISD::SRL,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrld + split.
470     { ISD::SRA,  MVT::v8i32, {  3,  6,  4,  5 } }, // psrad + split.
471 
472     { ISD::SHL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psllq.
473     { ISD::SRL,  MVT::v2i64, {  1,  2,  1,  1 } }, // psrlq.
474     { ISD::SRA,  MVT::v2i64, {  2,  3,  3,  3 } }, // psrad + shuffle.
475     { ISD::SHL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
476     { ISD::SRL,  MVT::v4i64, {  3,  6,  4,  5 } }, // 2 x psllq + split.
477     { ISD::SRA,  MVT::v4i64, {  5,  7,  8,  9 } }, // 2 x psrad + shuffle + split.
478 
479     { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480     { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481     { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482     { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483   };
484 
485   // XOP has faster vXi8 shifts.
486   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488     if (const auto *Entry =
489             CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490       if (auto KindCost = Entry->Cost[CostKind])
491         return LT.first * *KindCost;
492 
493   static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494     { ISD::SHL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psllw + pand.
495     { ISD::SRL,  MVT::v16i8, {  1,  7,  2,  3 } }, // psrlw + pand.
496     { ISD::SRA,  MVT::v16i8, {  3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
497 
498     { ISD::SHL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psllw.
499     { ISD::SRL,  MVT::v8i16, {  1,  1,  1,  1 } }, // psrlw.
500     { ISD::SRA,  MVT::v8i16, {  1,  1,  1,  1 } }, // psraw.
501 
502     { ISD::SHL,  MVT::v4i32, {  1,  1,  1,  1 } }, // pslld
503     { ISD::SRL,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrld.
504     { ISD::SRA,  MVT::v4i32, {  1,  1,  1,  1 } }, // psrad.
505 
506     { ISD::SHL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psllq.
507     { ISD::SRL,  MVT::v2i64, {  1,  1,  1,  1 } }, // psrlq.
508     { ISD::SRA,  MVT::v2i64, {  3,  5,  6,  6 } }, // 2 x psrad + shuffle.
509 
510     { ISD::SDIV, MVT::v4i32, {  6 } }, // pmuludq sequence
511     { ISD::SREM, MVT::v4i32, {  8 } }, // pmuludq+mul+sub sequence
512     { ISD::UDIV, MVT::v4i32, {  5 } }, // pmuludq sequence
513     { ISD::UREM, MVT::v4i32, {  7 } }, // pmuludq+mul+sub sequence
514   };
515 
516   // XOP has faster vXi8 shifts.
517   if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519     if (const auto *Entry =
520             CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521       if (auto KindCost = Entry->Cost[CostKind])
522         return LT.first * *KindCost;
523 
524   static const CostKindTblEntry AVX512BWConstCostTable[] = {
525     { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
526     { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527     { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
528     { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529 
530     { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
531     { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
532     { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
533     { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
534   };
535 
536   if (Op2Info.isConstant() && ST->hasBWI())
537     if (const auto *Entry =
538             CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539       if (auto KindCost = Entry->Cost[CostKind])
540         return LT.first * *KindCost;
541 
542   static const CostKindTblEntry AVX512ConstCostTable[] = {
543     { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
544     { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545     { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
546     { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547 
548     { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549     { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550     { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551     { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552 
553     { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554     { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555     { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556     { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557   };
558 
559   if (Op2Info.isConstant() && ST->hasAVX512())
560     if (const auto *Entry =
561             CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562       if (auto KindCost = Entry->Cost[CostKind])
563         return LT.first * *KindCost;
564 
565   static const CostKindTblEntry AVX2ConstCostTable[] = {
566     { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
567     { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568     { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
569     { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570 
571     { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
572     { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
573     { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
574     { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence
575 
576     { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
577     { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
578     { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
579     { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
580   };
581 
582   if (Op2Info.isConstant() && ST->hasAVX2())
583     if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584       if (auto KindCost = Entry->Cost[CostKind])
585         return LT.first * *KindCost;
586 
587   static const CostKindTblEntry AVXConstCostTable[] = {
588     { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
589     { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590     { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
591     { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592 
593     { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594     { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595     { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596     { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597 
598     { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
599     { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
600     { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
601     { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602   };
603 
604   if (Op2Info.isConstant() && ST->hasAVX())
605     if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606       if (auto KindCost = Entry->Cost[CostKind])
607         return LT.first * *KindCost;
608 
609   static const CostKindTblEntry SSE41ConstCostTable[] = {
610     { ISD::SDIV, MVT::v4i32,  { 15 } }, // vpmuludq sequence
611     { ISD::SREM, MVT::v4i32,  { 20 } }, // vpmuludq+mul+sub sequence
612   };
613 
614   if (Op2Info.isConstant() && ST->hasSSE41())
615     if (const auto *Entry =
616             CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617       if (auto KindCost = Entry->Cost[CostKind])
618         return LT.first * *KindCost;
619 
620   static const CostKindTblEntry SSE2ConstCostTable[] = {
621     { ISD::SDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
622     { ISD::SREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623     { ISD::UDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
624     { ISD::UREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625 
626     { ISD::SDIV, MVT::v8i16,  {  6 } }, // pmulhw sequence
627     { ISD::SREM, MVT::v8i16,  {  8 } }, // pmulhw+mul+sub sequence
628     { ISD::UDIV, MVT::v8i16,  {  6 } }, // pmulhuw sequence
629     { ISD::UREM, MVT::v8i16,  {  8 } }, // pmulhuw+mul+sub sequence
630 
631     { ISD::SDIV, MVT::v4i32,  { 19 } }, // pmuludq sequence
632     { ISD::SREM, MVT::v4i32,  { 24 } }, // pmuludq+mul+sub sequence
633     { ISD::UDIV, MVT::v4i32,  { 15 } }, // pmuludq sequence
634     { ISD::UREM, MVT::v4i32,  { 20 } }, // pmuludq+mul+sub sequence
635   };
636 
637   if (Op2Info.isConstant() && ST->hasSSE2())
638     if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639       if (auto KindCost = Entry->Cost[CostKind])
640         return LT.first * *KindCost;
641 
642   static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643     { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
644     { ISD::SRL,  MVT::v16i8,  { 3,10, 5, 8 } }, // psrlw + pand.
645     { ISD::SRA,  MVT::v16i8,  { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646     { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
647     { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
648     { ISD::SRA,  MVT::v32i8,  { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649     { ISD::SHL,  MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
650     { ISD::SRL,  MVT::v64i8,  { 4, 8, 7,10 } }, // psrlw + pand.
651     { ISD::SRA,  MVT::v64i8,  { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652 
653     { ISD::SHL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654     { ISD::SRL,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655     { ISD::SRA,  MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
656   };
657 
658   if (ST->hasBWI() && Op2Info.isUniform())
659     if (const auto *Entry =
660             CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661       if (auto KindCost = Entry->Cost[CostKind])
662         return LT.first * *KindCost;
663 
664   static const CostKindTblEntry AVX512UniformCostTable[] = {
        // Shift costs used when the subtarget has AVX512 and the second operand
        // is uniform. The v32i16 rows are only reached when the BWI lookup above
        // did not return, which is why they include a split.
665     { ISD::SHL,  MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666     { ISD::SRL,  MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667     { ISD::SRA,  MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668 
669     { ISD::SHL,  MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670     { ISD::SRL,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671     { ISD::SRA,  MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672 
673     { ISD::SRA,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
674     { ISD::SHL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
675     { ISD::SRL,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
676     { ISD::SRA,  MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
677     { ISD::SHL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
678     { ISD::SRL,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
679     { ISD::SRA,  MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
680   };
681 
682   if (ST->hasAVX512() && Op2Info.isUniform())
        // Return the table cost scaled by LT.first when a row matches and has a
        // value for CostKind; otherwise fall through to the next table.
683     if (const auto *Entry =
684             CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685       if (auto KindCost = Entry->Cost[CostKind])
686         return LT.first * *KindCost;
687 
688   static const CostKindTblEntry AVX2UniformCostTable[] = {
689     // Uniform splats are cheaper for the following instructions.
        // Consulted only when the subtarget has AVX2 and the second operand is
        // uniform (see the lookup below).
690     { ISD::SHL,  MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
691     { ISD::SRL,  MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
692     { ISD::SRA,  MVT::v16i8,  { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693     { ISD::SHL,  MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
694     { ISD::SRL,  MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
695     { ISD::SRA,  MVT::v32i8,  { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696 
697     { ISD::SHL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
698     { ISD::SRL,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
699     { ISD::SRA,  MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
700     { ISD::SHL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701     { ISD::SRL,  MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702     { ISD::SRA,  MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703 
704     { ISD::SHL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
705     { ISD::SRL,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
706     { ISD::SRA,  MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
707     { ISD::SHL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
708     { ISD::SRL,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
709     { ISD::SRA,  MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad
710 
711     { ISD::SHL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
712     { ISD::SRL,  MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
713     { ISD::SRA,  MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714     { ISD::SHL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
715     { ISD::SRL,  MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
716     { ISD::SRA,  MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717   };
718 
719   if (ST->hasAVX2() && Op2Info.isUniform())
        // Return the table cost scaled by LT.first when a row matches and has a
        // value for CostKind; otherwise fall through to the next table.
720     if (const auto *Entry =
721             CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722       if (auto KindCost = Entry->Cost[CostKind])
723         return LT.first * *KindCost;
724 
725   static const CostKindTblEntry AVXUniformCostTable[] = {
        // Uniform-amount shift costs for AVX. Rows for 256-bit types are
        // annotated "+ split" in their comments.
726     { ISD::SHL,  MVT::v16i8,  {  4, 4, 6, 8 } }, // psllw + pand.
727     { ISD::SRL,  MVT::v16i8,  {  4, 8, 5, 8 } }, // psrlw + pand.
728     { ISD::SRA,  MVT::v16i8,  {  6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729     { ISD::SHL,  MVT::v32i8,  {  7, 8,11,14 } }, // psllw + pand + split.
730     { ISD::SRL,  MVT::v32i8,  {  7, 9,10,14 } }, // psrlw + pand + split.
731     { ISD::SRA,  MVT::v32i8,  { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732 
733     { ISD::SHL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psllw.
734     { ISD::SRL,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psrlw.
735     { ISD::SRA,  MVT::v8i16,  {  1, 3, 1, 2 } }, // psraw.
736     { ISD::SHL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psllw + split.
737     { ISD::SRL,  MVT::v16i16, {  3, 7, 5, 7 } }, // psrlw + split.
738     { ISD::SRA,  MVT::v16i16, {  3, 7, 5, 7 } }, // psraw + split.
739 
740     { ISD::SHL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // pslld.
741     { ISD::SRL,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrld.
742     { ISD::SRA,  MVT::v4i32,  {  1, 3, 1, 2 } }, // psrad.
743     { ISD::SHL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // pslld + split.
744     { ISD::SRL,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrld + split.
745     { ISD::SRA,  MVT::v8i32,  {  3, 7, 5, 7 } }, // psrad + split.
746 
747     { ISD::SHL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psllq.
748     { ISD::SRL,  MVT::v2i64,  {  1, 3, 1, 2 } }, // psrlq.
749     { ISD::SRA,  MVT::v2i64,  {  3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750     { ISD::SHL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psllq + split.
751     { ISD::SRL,  MVT::v4i64,  {  3, 7, 4, 6 } }, // psrlq + split.
752     { ISD::SRA,  MVT::v4i64,  {  6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753   };
754 
755   // XOP has faster vXi8 shifts.
      // When XOP is available, types with 8-bit elements skip this table so that
      // a later lookup (XOPShiftCostTable has vXi8 rows) can supply the cost.
756   if (ST->hasAVX() && Op2Info.isUniform() &&
757       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758     if (const auto *Entry =
759             CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760       if (auto KindCost = Entry->Cost[CostKind])
761         return LT.first * *KindCost;
762 
763   static const CostKindTblEntry SSE2UniformCostTable[] = {
764     // Uniform splats are cheaper for the following instructions.
        // Only 128-bit vector types are listed in this table.
765     { ISD::SHL,  MVT::v16i8, {  9, 10, 6, 9 } }, // psllw + pand.
766     { ISD::SRL,  MVT::v16i8, {  9, 13, 5, 9 } }, // psrlw + pand.
767     { ISD::SRA,  MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768 
769     { ISD::SHL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psllw.
770     { ISD::SRL,  MVT::v8i16, {  2, 2, 1, 2 } }, // psrlw.
771     { ISD::SRA,  MVT::v8i16, {  2, 2, 1, 2 } }, // psraw.
772 
773     { ISD::SHL,  MVT::v4i32, {  2, 2, 1, 2 } }, // pslld
774     { ISD::SRL,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrld.
775     { ISD::SRA,  MVT::v4i32, {  2, 2, 1, 2 } }, // psrad.
776 
777     { ISD::SHL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psllq.
778     { ISD::SRL,  MVT::v2i64, {  2, 2, 1, 2 } }, // psrlq.
779     { ISD::SRA,  MVT::v2i64, {  5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780   };
781 
782   if (ST->hasSSE2() && Op2Info.isUniform() &&
783       (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
        // Same XOP exclusion as the AVX uniform lookup: with XOP, types with
        // 8-bit elements bypass this table and are left to later lookups.
784     if (const auto *Entry =
785             CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786       if (auto KindCost = Entry->Cost[CostKind])
787         return LT.first * *KindCost;
788 
789   static const CostKindTblEntry AVX512DQCostTable[] = {
        // 64-bit element vector multiplies, consulted when the subtarget has DQI.
790     { ISD::MUL,  MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791     { ISD::MUL,  MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792     { ISD::MUL,  MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
793   };
794 
795   // Look for AVX512DQ lowering tricks for custom cases.
      // Unlike the uniform-shift lookups above, this one does not test Op2Info.
796   if (ST->hasDQI())
797     if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798       if (auto KindCost = Entry->Cost[CostKind])
799         return LT.first * *KindCost;
800 
801   static const CostKindTblEntry AVX512BWCostTable[] = {
        // Shift, add, sub and mul costs consulted when the subtarget has BWI.
        // The lookup below does not test Op2Info; uniform shift amounts on BWI
        // targets were already matched against AVX512BWUniformCostTable earlier.
802     { ISD::SHL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803     { ISD::SRL,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804     { ISD::SRA,   MVT::v16i8,   {  4,  8, 4, 5 } }, // extend/vpsravw/pack sequence.
805     { ISD::SHL,   MVT::v32i8,   {  4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806     { ISD::SRL,   MVT::v32i8,   {  4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807     { ISD::SRA,   MVT::v32i8,   {  6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808     { ISD::SHL,   MVT::v64i8,   {  6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809     { ISD::SRL,   MVT::v64i8,   {  7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810     { ISD::SRA,   MVT::v64i8,   { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811 
812     { ISD::SHL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsllvw
813     { ISD::SRL,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsrlvw
814     { ISD::SRA,   MVT::v8i16,   {  1,  1, 1, 1 } }, // vpsravw
815     { ISD::SHL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsllvw
816     { ISD::SRL,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsrlvw
817     { ISD::SRA,   MVT::v16i16,  {  1,  1, 1, 1 } }, // vpsravw
818     { ISD::SHL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsllvw
819     { ISD::SRL,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsrlvw
820     { ISD::SRA,   MVT::v32i16,  {  1,  1, 1, 1 } }, // vpsravw
821 
822     { ISD::ADD,   MVT::v64i8,   {  1,  1, 1, 1 } }, // paddb
823     { ISD::ADD,   MVT::v32i16,  {  1,  1, 1, 1 } }, // paddw
824 
825     { ISD::ADD,   MVT::v32i8,   {  1,  1, 1, 1 } }, // paddb
826     { ISD::ADD,   MVT::v16i16,  {  1,  1, 1, 1 } }, // paddw
827     { ISD::ADD,   MVT::v8i32,   {  1,  1, 1, 1 } }, // paddd
828     { ISD::ADD,   MVT::v4i64,   {  1,  1, 1, 1 } }, // paddq
829 
830     { ISD::SUB,   MVT::v64i8,   {  1,  1, 1, 1 } }, // psubb
831     { ISD::SUB,   MVT::v32i16,  {  1,  1, 1, 1 } }, // psubw
832 
833     { ISD::MUL,   MVT::v64i8,   {  5, 10,10,11 } },
834     { ISD::MUL,   MVT::v32i16,  {  1,  5, 1, 1 } }, // pmullw
835 
836     { ISD::SUB,   MVT::v32i8,   {  1,  1, 1, 1 } }, // psubb
837     { ISD::SUB,   MVT::v16i16,  {  1,  1, 1, 1 } }, // psubw
838     { ISD::SUB,   MVT::v8i32,   {  1,  1, 1, 1 } }, // psubd
839     { ISD::SUB,   MVT::v4i64,   {  1,  1, 1, 1 } }, // psubq
840   };
841 
842   // Look for AVX512BW lowering tricks for custom cases.
843   if (ST->hasBWI())
844     if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845       if (auto KindCost = Entry->Cost[CostKind])
846         return LT.first * *KindCost;
847 
848   static const CostKindTblEntry AVX512CostTable[] = {
        // General AVX512 costs for integer and floating-point arithmetic. The
        // v64i8/v32i16 rows are only reached when the BWI lookup above did not
        // return, hence the split/extend sequences in their comments.
849     { ISD::SHL,     MVT::v64i8,   { 15, 19,27,33 } }, // vpblendv+split sequence.
850     { ISD::SRL,     MVT::v64i8,   { 15, 19,30,36 } }, // vpblendv+split sequence.
851     { ISD::SRA,     MVT::v64i8,   { 37, 37,51,63 } }, // vpblendv+split sequence.
852 
853     { ISD::SHL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
854     { ISD::SRL,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855     { ISD::SRA,     MVT::v32i16,  { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856 
857     { ISD::SHL,     MVT::v4i32,   {  1,  1, 1, 1 } },
858     { ISD::SRL,     MVT::v4i32,   {  1,  1, 1, 1 } },
859     { ISD::SRA,     MVT::v4i32,   {  1,  1, 1, 1 } },
860     { ISD::SHL,     MVT::v8i32,   {  1,  1, 1, 1 } },
861     { ISD::SRL,     MVT::v8i32,   {  1,  1, 1, 1 } },
862     { ISD::SRA,     MVT::v8i32,   {  1,  1, 1, 1 } },
863     { ISD::SHL,     MVT::v16i32,  {  1,  1, 1, 1 } },
864     { ISD::SRL,     MVT::v16i32,  {  1,  1, 1, 1 } },
865     { ISD::SRA,     MVT::v16i32,  {  1,  1, 1, 1 } },
866 
867     { ISD::SHL,     MVT::v2i64,   {  1,  1, 1, 1 } },
868     { ISD::SRL,     MVT::v2i64,   {  1,  1, 1, 1 } },
869     { ISD::SRA,     MVT::v2i64,   {  1,  1, 1, 1 } },
870     { ISD::SHL,     MVT::v4i64,   {  1,  1, 1, 1 } },
871     { ISD::SRL,     MVT::v4i64,   {  1,  1, 1, 1 } },
872     { ISD::SRA,     MVT::v4i64,   {  1,  1, 1, 1 } },
873     { ISD::SHL,     MVT::v8i64,   {  1,  1, 1, 1 } },
874     { ISD::SRL,     MVT::v8i64,   {  1,  1, 1, 1 } },
875     { ISD::SRA,     MVT::v8i64,   {  1,  1, 1, 1 } },
876 
877     { ISD::ADD,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*paddb + split
878     { ISD::ADD,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*paddw + split
879 
880     { ISD::SUB,     MVT::v64i8,   {  3,  7, 5, 5 } }, // 2*psubb + split
881     { ISD::SUB,     MVT::v32i16,  {  3,  7, 5, 5 } }, // 2*psubw + split
882 
883     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 1 } },
884     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 1 } },
885     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 1 } },
886     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 1 } },
887 
888     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 1 } },
889     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 1 } },
890     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 1 } },
891     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 1 } },
892 
893     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 1 } },
894     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 1 } },
895     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 1 } },
896     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 1 } },
897 
898     { ISD::MUL,     MVT::v16i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899     { ISD::MUL,     MVT::v8i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900     { ISD::MUL,     MVT::v4i32,   {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901     { ISD::MUL,     MVT::v8i64,   {  6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
        // NOTE(review): the next row supplies a single cost value; presumably
        // only the first cost kind is populated and other kinds fall through to
        // later tables -- confirm against the CostKindTblEntry definition.
902     { ISD::MUL,     MVT::i64,     {  1 } }, // Skylake from http://www.agner.org/
903 
904     { X86ISD::PMULUDQ, MVT::v8i64, { 1,  5, 1, 1 } },
905 
906     { ISD::FNEG,    MVT::v8f64,   {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
907     { ISD::FADD,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
908     { ISD::FADD,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
909     { ISD::FSUB,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
910     { ISD::FSUB,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
911     { ISD::FMUL,    MVT::v8f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
912     { ISD::FMUL,    MVT::v4f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
913     { ISD::FMUL,    MVT::v2f64,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
914     { ISD::FMUL,    MVT::f64,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
915 
916     { ISD::FDIV,    MVT::f64,     {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917     { ISD::FDIV,    MVT::v2f64,   {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918     { ISD::FDIV,    MVT::v4f64,   {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919     { ISD::FDIV,    MVT::v8f64,   { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920 
921     { ISD::FNEG,    MVT::v16f32,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
922     { ISD::FADD,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
923     { ISD::FADD,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
924     { ISD::FSUB,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
925     { ISD::FSUB,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
926     { ISD::FMUL,    MVT::v16f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
927     { ISD::FMUL,    MVT::v8f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
928     { ISD::FMUL,    MVT::v4f32,   {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
929     { ISD::FMUL,    MVT::f32,     {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
930 
931     { ISD::FDIV,    MVT::f32,     {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932     { ISD::FDIV,    MVT::v4f32,   {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933     { ISD::FDIV,    MVT::v8f32,   {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934     { ISD::FDIV,    MVT::v16f32,  { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935   };
936 
937   if (ST->hasAVX512())
        // Return the table cost scaled by LT.first when a row matches and has a
        // value for CostKind; otherwise fall through to the next table.
938     if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939       if (auto KindCost = Entry->Cost[CostKind])
940         return LT.first * *KindCost;
941 
942   static const CostKindTblEntry AVX2ShiftCostTable[] = {
943     // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them
944     // custom in order to detect the cases where the shift amount is a scalar.
        // There are no SRA rows for v2i64/v4i64 in this table; AVX2CostTable
        // further down lists those as a srl/xor/sub sequence.
945     { ISD::SHL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946     { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947     { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948     { ISD::SHL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949     { ISD::SRL,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950     { ISD::SRA,     MVT::v8i32,  { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951     { ISD::SHL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952     { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953     { ISD::SHL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954     { ISD::SRL,     MVT::v4i64,  { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955   };
956 
957   if (ST->hasAVX512()) {
958     if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959       // On AVX512, a packed v32i16 shift left by a constant build_vector
960       // is lowered into a vector multiply (vpmullw).
          // The cost is obtained by re-querying this function for a Mul on the
          // same type, passing getNoProps() versions of both operand infos.
961       return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962                                     Op1Info.getNoProps(), Op2Info.getNoProps());
963   }
964 
965   // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966   if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968         Op2Info.isConstant())
969       // On AVX2, a packed v16i16 shift left by a constant build_vector
970       // is lowered into a vector multiply (vpmullw).
971       return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972                                     Op1Info.getNoProps(), Op2Info.getNoProps());
973 
        // Otherwise use the variable-shift table above; a miss falls through.
974     if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975       if (auto KindCost = Entry->Cost[CostKind])
976         return LT.first * *KindCost;
977   }
978 
979   static const CostKindTblEntry XOPShiftCostTable[] = {
980     // 128bit shifts take 1cy, but right shifts require negation beforehand.
        // This is why the SRL/SRA rows carry a higher first cost than the SHL
        // rows of the same type.
981     { ISD::SHL,     MVT::v16i8,  { 1, 3, 1, 1 } },
982     { ISD::SRL,     MVT::v16i8,  { 2, 3, 1, 1 } },
983     { ISD::SRA,     MVT::v16i8,  { 2, 3, 1, 1 } },
984     { ISD::SHL,     MVT::v8i16,  { 1, 3, 1, 1 } },
985     { ISD::SRL,     MVT::v8i16,  { 2, 3, 1, 1 } },
986     { ISD::SRA,     MVT::v8i16,  { 2, 3, 1, 1 } },
987     { ISD::SHL,     MVT::v4i32,  { 1, 3, 1, 1 } },
988     { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 1 } },
989     { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 1 } },
990     { ISD::SHL,     MVT::v2i64,  { 1, 3, 1, 1 } },
991     { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } },
992     { ISD::SRA,     MVT::v2i64,  { 2, 3, 1, 1 } },
993     // 256bit shifts require splitting if AVX2 didn't catch them above.
994     { ISD::SHL,     MVT::v32i8,  { 4, 7, 5, 6 } },
995     { ISD::SRL,     MVT::v32i8,  { 6, 7, 5, 6 } },
996     { ISD::SRA,     MVT::v32i8,  { 6, 7, 5, 6 } },
997     { ISD::SHL,     MVT::v16i16, { 4, 7, 5, 6 } },
998     { ISD::SRL,     MVT::v16i16, { 6, 7, 5, 6 } },
999     { ISD::SRA,     MVT::v16i16, { 6, 7, 5, 6 } },
1000     { ISD::SHL,     MVT::v8i32,  { 4, 7, 5, 6 } },
1001     { ISD::SRL,     MVT::v8i32,  { 6, 7, 5, 6 } },
1002     { ISD::SRA,     MVT::v8i32,  { 6, 7, 5, 6 } },
1003     { ISD::SHL,     MVT::v4i64,  { 4, 7, 5, 6 } },
1004     { ISD::SRL,     MVT::v4i64,  { 6, 7, 5, 6 } },
1005     { ISD::SRA,     MVT::v4i64,  { 6, 7, 5, 6 } },
1006   };
1007 
1008   // Look for XOP lowering tricks.
1009   if (ST->hasXOP()) {
1010     // If the right shift is constant then we'll fold the negation so
1011     // it's as cheap as a left shift.
         // ShiftISD is a local copy used only for this lookup; ISD itself is
         // left unchanged for the code that follows.
1012     int ShiftISD = ISD;
1013     if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014       ShiftISD = ISD::SHL;
1015     if (const auto *Entry =
1016             CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017       if (auto KindCost = Entry->Cost[CostKind])
1018         return LT.first * *KindCost;
1019   }
1020 
1021   if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022     MVT VT = LT.second;
1023     // Vector shift left by non uniform constant can be lowered
1024     // into vector multiply.
         // ISD is overwritten here, so every table lookup after this point is
         // performed with ISD::MUL instead of ISD::SHL. This applies to
         // v8i16/v4i32 with SSE2 and to v16i16/v8i32 with AVX.
1025     if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026         ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027       ISD = ISD::MUL;
1028   }
1029 
1030   static const CostKindTblEntry GLMCostTable[] = {
         // FDIV costs consulted only when the subtarget reports
         // useGLMDivSqrtCosts(). Only scalar and 128-bit types are listed.
1031     { ISD::FDIV,  MVT::f32,   { 18, 19, 1, 1 } }, // divss
1032     { ISD::FDIV,  MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033     { ISD::FDIV,  MVT::f64,   { 33, 34, 1, 1 } }, // divsd
1034     { ISD::FDIV,  MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035   };
1036 
1037   if (ST->useGLMDivSqrtCosts())
         // Return the table cost scaled by LT.first when a row matches and has
         // a value for CostKind; otherwise fall through to the next table.
1038     if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039       if (auto KindCost = Entry->Cost[CostKind])
1040         return LT.first * *KindCost;
1041 
1042   static const CostKindTblEntry SLMCostTable[] = {
         // Arithmetic costs consulted only when the subtarget reports
         // useSLMArithCosts().
1043     { ISD::MUL,   MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044     { ISD::MUL,   MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
1045     { ISD::FMUL,  MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
1046     { ISD::FMUL,  MVT::f32,   {  1,  4, 1, 1 } }, // mulss
1047     { ISD::FMUL,  MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
1048     { ISD::FMUL,  MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
1049     { ISD::FDIV,  MVT::f32,   { 17, 19, 1, 1 } }, // divss
1050     { ISD::FDIV,  MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051     { ISD::FDIV,  MVT::f64,   { 32, 34, 1, 1 } }, // divsd
1052     { ISD::FDIV,  MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053     { ISD::FADD,  MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
1054     { ISD::FSUB,  MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
1055     // v2i64/v4i64 mul is custom lowered as a series of long
1056     // multiplies(3), shifts(3) and adds(2).
1057     // slm muldq throughput is 2 and addq throughput is 4,
1058     // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1059     //       2X4 (addq throughput) = 17
1060     { ISD::MUL,   MVT::v2i64, { 17, 22, 9, 9 } },
1061     // slm addq\subq throughput is 4
1062     { ISD::ADD,   MVT::v2i64, {  4,  2, 1, 2 } },
1063     { ISD::SUB,   MVT::v2i64, {  4,  2, 1, 2 } },
1064   };
1065 
1066   if (ST->useSLMArithCosts())
         // Return the table cost scaled by LT.first when a row matches and has
         // a value for CostKind; otherwise fall through to the next table.
1067     if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068       if (auto KindCost = Entry->Cost[CostKind])
1069         return LT.first * *KindCost;
1070 
1071   static const CostKindTblEntry AVX2CostTable[] = {
         // General AVX2 costs. The vXi32 and SHL/SRL vXi64 shifts are absent
         // here because AVX2ShiftCostTable is consulted earlier for them.
1072     { ISD::SHL,  MVT::v16i8,   {  6, 21,11,16 } }, // vpblendvb sequence.
1073     { ISD::SHL,  MVT::v32i8,   {  6, 23,11,22 } }, // vpblendvb sequence.
1074     { ISD::SHL,  MVT::v8i16,   {  5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
1075     { ISD::SHL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1076 
1077     { ISD::SRL,  MVT::v16i8,   {  6, 27,12,18 } }, // vpblendvb sequence.
1078     { ISD::SRL,  MVT::v32i8,   {  8, 30,12,24 } }, // vpblendvb sequence.
1079     { ISD::SRL,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080     { ISD::SRL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081 
1082     { ISD::SRA,  MVT::v16i8,   { 17, 17,24,30 } }, // vpblendvb sequence.
1083     { ISD::SRA,  MVT::v32i8,   { 18, 20,24,43 } }, // vpblendvb sequence.
1084     { ISD::SRA,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085     { ISD::SRA,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086     { ISD::SRA,  MVT::v2i64,   {  4,  5, 5, 5 } }, // srl/xor/sub sequence.
1087     { ISD::SRA,  MVT::v4i64,   {  8,  8, 5, 9 } }, // srl/xor/sub sequence.
1088 
1089     { ISD::SUB,  MVT::v32i8,   {  1,  1, 1, 2 } }, // psubb
1090     { ISD::ADD,  MVT::v32i8,   {  1,  1, 1, 2 } }, // paddb
1091     { ISD::SUB,  MVT::v16i16,  {  1,  1, 1, 2 } }, // psubw
1092     { ISD::ADD,  MVT::v16i16,  {  1,  1, 1, 2 } }, // paddw
1093     { ISD::SUB,  MVT::v8i32,   {  1,  1, 1, 2 } }, // psubd
1094     { ISD::ADD,  MVT::v8i32,   {  1,  1, 1, 2 } }, // paddd
1095     { ISD::SUB,  MVT::v4i64,   {  1,  1, 1, 2 } }, // psubq
1096     { ISD::ADD,  MVT::v4i64,   {  1,  1, 1, 2 } }, // paddq
1097 
1098     { ISD::MUL,  MVT::v16i8,   {  5, 18, 6,12 } }, // extend/pmullw/pack
1099     { ISD::MUL,  MVT::v32i8,   {  6, 11,10,19 } }, // unpack/pmullw
1100     { ISD::MUL,  MVT::v16i16,  {  2,  5, 1, 2 } }, // pmullw
1101     { ISD::MUL,  MVT::v8i32,   {  4, 10, 1, 2 } }, // pmulld
1102     { ISD::MUL,  MVT::v4i32,   {  2, 10, 1, 2 } }, // pmulld
1103     { ISD::MUL,  MVT::v4i64,   {  6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104     { ISD::MUL,  MVT::v2i64,   {  6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105 
1106     { X86ISD::PMULUDQ, MVT::v4i64, { 1,  5, 1, 1 } },
1107 
1108     { ISD::FNEG, MVT::v4f64,   {  1,  1, 1, 2 } }, // vxorpd
1109     { ISD::FNEG, MVT::v8f32,   {  1,  1, 1, 2 } }, // vxorps
1110 
1111     { ISD::FADD, MVT::f64,     {  1,  4, 1, 1 } }, // vaddsd
1112     { ISD::FADD, MVT::f32,     {  1,  4, 1, 1 } }, // vaddss
1113     { ISD::FADD, MVT::v2f64,   {  1,  4, 1, 1 } }, // vaddpd
1114     { ISD::FADD, MVT::v4f32,   {  1,  4, 1, 1 } }, // vaddps
1115     { ISD::FADD, MVT::v4f64,   {  1,  4, 1, 2 } }, // vaddpd
1116     { ISD::FADD, MVT::v8f32,   {  1,  4, 1, 2 } }, // vaddps
1117 
1118     { ISD::FSUB, MVT::f64,     {  1,  4, 1, 1 } }, // vsubsd
1119     { ISD::FSUB, MVT::f32,     {  1,  4, 1, 1 } }, // vsubss
1120     { ISD::FSUB, MVT::v2f64,   {  1,  4, 1, 1 } }, // vsubpd
1121     { ISD::FSUB, MVT::v4f32,   {  1,  4, 1, 1 } }, // vsubps
1122     { ISD::FSUB, MVT::v4f64,   {  1,  4, 1, 2 } }, // vsubpd
1123     { ISD::FSUB, MVT::v8f32,   {  1,  4, 1, 2 } }, // vsubps
1124 
1125     { ISD::FMUL, MVT::f64,     {  1,  5, 1, 1 } }, // vmulsd
1126     { ISD::FMUL, MVT::f32,     {  1,  5, 1, 1 } }, // vmulss
1127     { ISD::FMUL, MVT::v2f64,   {  1,  5, 1, 1 } }, // vmulpd
1128     { ISD::FMUL, MVT::v4f32,   {  1,  5, 1, 1 } }, // vmulps
1129     { ISD::FMUL, MVT::v4f64,   {  1,  5, 1, 2 } }, // vmulpd
1130     { ISD::FMUL, MVT::v8f32,   {  1,  5, 1, 2 } }, // vmulps
1131 
1132     { ISD::FDIV, MVT::f32,     {  7, 13, 1, 1 } }, // vdivss
1133     { ISD::FDIV, MVT::v4f32,   {  7, 13, 1, 1 } }, // vdivps
1134     { ISD::FDIV, MVT::v8f32,   { 14, 21, 1, 3 } }, // vdivps
1135     { ISD::FDIV, MVT::f64,     { 14, 20, 1, 1 } }, // vdivsd
1136     { ISD::FDIV, MVT::v2f64,   { 14, 20, 1, 1 } }, // vdivpd
1137     { ISD::FDIV, MVT::v4f64,   { 28, 35, 1, 3 } }, // vdivpd
1138   };
1139 
1140   // Look for AVX2 lowering tricks for custom cases.
1141   if (ST->hasAVX2())
1142     if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143       if (auto KindCost = Entry->Cost[CostKind])
1144         return LT.first * *KindCost;
1145 
1146   static const CostKindTblEntry AVX1CostTable[] = {
1147     // We don't have to scalarize unsupported ops. We can issue two half-sized
1148     // operations and we only need to extract the upper YMM half.
1149     // Two ops + 1 extract + 1 insert = 4.
         // Rows whose comments end in "+ split" are the 256-bit integer cases
         // costed that way. This table is consulted when the subtarget has AVX
         // and none of the earlier lookups returned.
1150     { ISD::MUL,     MVT::v32i8,   { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151     { ISD::MUL,     MVT::v16i16,  {  4,  8,  5,  6 } }, // pmullw + split
1152     { ISD::MUL,     MVT::v8i32,   {  5,  8,  5, 10 } }, // pmulld + split
1153     { ISD::MUL,     MVT::v4i32,   {  2,  5,  1,  3 } }, // pmulld
1154     { ISD::MUL,     MVT::v4i64,   { 12, 15, 19, 20 } },
1155 
1156     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vandps
1157     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vandps
1158     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vandps
1159     { ISD::AND,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vandps
1160 
1161     { ISD::OR,      MVT::v32i8,   {  1,  1, 1, 2 } }, // vorps
1162     { ISD::OR,      MVT::v16i16,  {  1,  1, 1, 2 } }, // vorps
1163     { ISD::OR,      MVT::v8i32,   {  1,  1, 1, 2 } }, // vorps
1164     { ISD::OR,      MVT::v4i64,   {  1,  1, 1, 2 } }, // vorps
1165 
1166     { ISD::XOR,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vxorps
1167     { ISD::XOR,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vxorps
1168     { ISD::XOR,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vxorps
1169     { ISD::XOR,     MVT::v4i64,   {  1,  1, 1, 2 } }, // vxorps
1170 
1171     { ISD::SUB,     MVT::v32i8,   {  4,  2, 5, 6 } }, // psubb + split
1172     { ISD::ADD,     MVT::v32i8,   {  4,  2, 5, 6 } }, // paddb + split
1173     { ISD::SUB,     MVT::v16i16,  {  4,  2, 5, 6 } }, // psubw + split
1174     { ISD::ADD,     MVT::v16i16,  {  4,  2, 5, 6 } }, // paddw + split
1175     { ISD::SUB,     MVT::v8i32,   {  4,  2, 5, 6 } }, // psubd + split
1176     { ISD::ADD,     MVT::v8i32,   {  4,  2, 5, 6 } }, // paddd + split
1177     { ISD::SUB,     MVT::v4i64,   {  4,  2, 5, 6 } }, // psubq + split
1178     { ISD::ADD,     MVT::v4i64,   {  4,  2, 5, 6 } }, // paddq + split
1179     { ISD::SUB,     MVT::v2i64,   {  1,  1, 1, 1 } }, // psubq
1180     { ISD::ADD,     MVT::v2i64,   {  1,  1, 1, 1 } }, // paddq
1181 
1182     { ISD::SHL,     MVT::v16i8,   { 10, 21,11,17 } }, // pblendvb sequence.
1183     { ISD::SHL,     MVT::v32i8,   { 22, 22,27,40 } }, // pblendvb sequence + split.
1184     { ISD::SHL,     MVT::v8i16,   {  6,  9,11,11 } }, // pblendvb sequence.
1185     { ISD::SHL,     MVT::v16i16,  { 13, 16,24,25 } }, // pblendvb sequence + split.
1186     { ISD::SHL,     MVT::v4i32,   {  3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187     { ISD::SHL,     MVT::v8i32,   {  9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188     { ISD::SHL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1189     { ISD::SHL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1190 
1191     { ISD::SRL,     MVT::v16i8,   { 11, 27,12,18 } }, // pblendvb sequence.
1192     { ISD::SRL,     MVT::v32i8,   { 23, 23,30,43 } }, // pblendvb sequence + split.
1193     { ISD::SRL,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1194     { ISD::SRL,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1195     { ISD::SRL,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1196     { ISD::SRL,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197     { ISD::SRL,     MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
1198     { ISD::SRL,     MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.
1199 
1200     { ISD::SRA,     MVT::v16i8,   { 21, 22,24,36 } }, // pblendvb sequence.
1201     { ISD::SRA,     MVT::v32i8,   { 44, 45,51,76 } }, // pblendvb sequence + split.
1202     { ISD::SRA,     MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
1203     { ISD::SRA,     MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
1204     { ISD::SRA,     MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
1205     { ISD::SRA,     MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206     { ISD::SRA,     MVT::v2i64,   {  5,  6,10,14 } }, // Shift each lane + blend.
1207     { ISD::SRA,     MVT::v4i64,   { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208 
1209     { ISD::FNEG,    MVT::v4f64,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210     { ISD::FNEG,    MVT::v8f32,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211 
1212     { ISD::FADD,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213     { ISD::FADD,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214     { ISD::FADD,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215     { ISD::FADD,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216     { ISD::FADD,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217     { ISD::FADD,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218 
1219     { ISD::FSUB,    MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220     { ISD::FSUB,    MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221     { ISD::FSUB,    MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222     { ISD::FSUB,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223     { ISD::FSUB,    MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224     { ISD::FSUB,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225 
1226     { ISD::FMUL,    MVT::f64,     {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227     { ISD::FMUL,    MVT::f32,     {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228     { ISD::FMUL,    MVT::v2f64,   {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229     { ISD::FMUL,    MVT::v4f32,   {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230     { ISD::FMUL,    MVT::v4f64,   {  4,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231     { ISD::FMUL,    MVT::v8f32,   {  2,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232 
1233     { ISD::FDIV,    MVT::f32,     { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234     { ISD::FDIV,    MVT::v4f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235     { ISD::FDIV,    MVT::v8f32,   { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236     { ISD::FDIV,    MVT::f64,     { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237     { ISD::FDIV,    MVT::v2f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238     { ISD::FDIV,    MVT::v4f64,   { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239   };
1240 
1241   if (ST->hasAVX())
         // Return the table cost scaled by LT.first when a row matches and has
         // a value for CostKind; otherwise fall through to the next table.
1242     if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243       if (auto KindCost = Entry->Cost[CostKind])
1244         return LT.first * *KindCost;
1245 
1246   static const CostKindTblEntry SSE42CostTable[] = {
1247     { ISD::FADD, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248     { ISD::FADD, MVT::f32,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249     { ISD::FADD, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250     { ISD::FADD, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251 
1252     { ISD::FSUB, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253     { ISD::FSUB, MVT::f32 ,   {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254     { ISD::FSUB, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255     { ISD::FSUB, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256 
1257     { ISD::FMUL, MVT::f64,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258     { ISD::FMUL, MVT::f32,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259     { ISD::FMUL, MVT::v2f64,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260     { ISD::FMUL, MVT::v4f32,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261 
1262     { ISD::FDIV,  MVT::f32,   { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263     { ISD::FDIV,  MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264     { ISD::FDIV,  MVT::f64,   { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265     { ISD::FDIV,  MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266 
1267     { ISD::MUL,   MVT::v2i64, {  6, 10,10,10 } }  // 3*pmuludq/3*shift/2*add
1268   };
1269 
1270   if (ST->hasSSE42())
1271     if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272       if (auto KindCost = Entry->Cost[CostKind])
1273         return LT.first * *KindCost;
1274 
1275   static const CostKindTblEntry SSE41CostTable[] = {
1276     { ISD::SHL,  MVT::v16i8,  { 15, 24,17,22 } }, // pblendvb sequence.
1277     { ISD::SHL,  MVT::v8i16,  { 11, 14,11,11 } }, // pblendvb sequence.
1278     { ISD::SHL,  MVT::v4i32,  { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279 
1280     { ISD::SRL,  MVT::v16i8,  { 16, 27,18,24 } }, // pblendvb sequence.
1281     { ISD::SRL,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1282     { ISD::SRL,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1283     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1284 
1285     { ISD::SRA,  MVT::v16i8,  { 38, 41,30,36 } }, // pblendvb sequence.
1286     { ISD::SRA,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
1287     { ISD::SRA,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
1288     { ISD::SRA,  MVT::v2i64,  {  8, 17, 5, 7 } }, // splat+shuffle sequence.
1289 
1290     { ISD::MUL,  MVT::v16i8,  {  5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291     { ISD::MUL,  MVT::v4i32,  {  2, 11, 1, 1 } }  // pmulld (Nehalem from agner.org)
1292   };
1293 
1294   if (ST->hasSSE41())
1295     if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296       if (auto KindCost = Entry->Cost[CostKind])
1297         return LT.first * *KindCost;
1298 
1299   static const CostKindTblEntry SSE2CostTable[] = {
1300     // We don't correctly identify costs of casts because they are marked as
1301     // custom.
1302     { ISD::SHL,  MVT::v16i8,  { 13, 21,26,28 } }, // cmpgtb sequence.
1303     { ISD::SHL,  MVT::v8i16,  { 24, 27,16,20 } }, // cmpgtw sequence.
1304     { ISD::SHL,  MVT::v4i32,  { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305     { ISD::SHL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1306 
1307     { ISD::SRL,  MVT::v16i8,  { 14, 28,27,30 } }, // cmpgtb sequence.
1308     { ISD::SRL,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1309     { ISD::SRL,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1310     { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.
1311 
1312     { ISD::SRA,  MVT::v16i8,  { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313     { ISD::SRA,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
1314     { ISD::SRA,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
1315     { ISD::SRA,  MVT::v2i64,  {  8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316 
1317     { ISD::AND,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pand
1318     { ISD::AND,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pand
1319     { ISD::AND,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pand
1320     { ISD::AND,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pand
1321 
1322     { ISD::OR,   MVT::v16i8,  {  1,  1, 1, 1 } }, // por
1323     { ISD::OR,   MVT::v8i16,  {  1,  1, 1, 1 } }, // por
1324     { ISD::OR,   MVT::v4i32,  {  1,  1, 1, 1 } }, // por
1325     { ISD::OR,   MVT::v2i64,  {  1,  1, 1, 1 } }, // por
1326 
1327     { ISD::XOR,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pxor
1328     { ISD::XOR,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pxor
1329     { ISD::XOR,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pxor
1330     { ISD::XOR,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pxor
1331 
1332     { ISD::ADD,  MVT::v2i64,  {  1,  2, 1, 2 } }, // paddq
1333     { ISD::SUB,  MVT::v2i64,  {  1,  2, 1, 2 } }, // psubq
1334 
1335     { ISD::MUL,  MVT::v16i8,  {  5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336     { ISD::MUL,  MVT::v8i16,  {  1,  5, 1, 1 } }, // pmullw
1337     { ISD::MUL,  MVT::v4i32,  {  6,  8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338     { ISD::MUL,  MVT::v2i64,  {  7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339 
1340     { X86ISD::PMULUDQ, MVT::v2i64, { 1,  5, 1, 1 } },
1341 
1342     { ISD::FDIV, MVT::f32,    { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343     { ISD::FDIV, MVT::v4f32,  { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344     { ISD::FDIV, MVT::f64,    { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345     { ISD::FDIV, MVT::v2f64,  { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346 
1347     { ISD::FNEG, MVT::f32,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348     { ISD::FNEG, MVT::f64,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349     { ISD::FNEG, MVT::v4f32,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350     { ISD::FNEG, MVT::v2f64,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351 
1352     { ISD::FADD, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353     { ISD::FADD, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354     { ISD::FADD, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355 
1356     { ISD::FSUB, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357     { ISD::FSUB, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358     { ISD::FSUB, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359 
1360     { ISD::FMUL, MVT::f64,    {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361     { ISD::FMUL, MVT::v2f64,  {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362   };
1363 
1364   if (ST->hasSSE2())
1365     if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366       if (auto KindCost = Entry->Cost[CostKind])
1367         return LT.first * *KindCost;
1368 
1369   static const CostKindTblEntry SSE1CostTable[] = {
1370     { ISD::FDIV, MVT::f32,   { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371     { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372 
1373     { ISD::FNEG, MVT::f32,   {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374     { ISD::FNEG, MVT::v4f32, {  2,  2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375 
1376     { ISD::FADD, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377     { ISD::FADD, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378 
1379     { ISD::FSUB, MVT::f32,   {  1,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380     { ISD::FSUB, MVT::v4f32, {  2,  3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381 
1382     { ISD::FMUL, MVT::f32,   {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383     { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384   };
1385 
1386   if (ST->hasSSE1())
1387     if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388       if (auto KindCost = Entry->Cost[CostKind])
1389         return LT.first * *KindCost;
1390 
1391   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392     { ISD::ADD,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1393     { ISD::SUB,  MVT::i64,  {  1 } }, // Core (Merom) from http://www.agner.org/
1394     { ISD::MUL,  MVT::i64,  {  2,  6,  1,  2 } },
1395   };
1396 
1397   if (ST->is64Bit())
1398     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399       if (auto KindCost = Entry->Cost[CostKind])
1400         return LT.first * *KindCost;
1401 
1402   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403     { ISD::ADD,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1404     { ISD::ADD,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1405     { ISD::ADD,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1406 
1407     { ISD::SUB,  MVT::i8,  {  1 } }, // Pentium III from http://www.agner.org/
1408     { ISD::SUB,  MVT::i16, {  1 } }, // Pentium III from http://www.agner.org/
1409     { ISD::SUB,  MVT::i32, {  1 } }, // Pentium III from http://www.agner.org/
1410 
1411     { ISD::MUL,  MVT::i8,  {  3,  4, 1, 1 } },
1412     { ISD::MUL,  MVT::i16, {  2,  4, 1, 1 } },
1413     { ISD::MUL,  MVT::i32, {  1,  4, 1, 1 } },
1414 
1415     { ISD::FNEG, MVT::f64, {  2,  2, 1, 3 } }, // (x87)
1416     { ISD::FADD, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1417     { ISD::FSUB, MVT::f64, {  2,  3, 1, 1 } }, // (x87)
1418     { ISD::FMUL, MVT::f64, {  2,  5, 1, 1 } }, // (x87)
1419     { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420   };
1421 
1422   if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423     if (auto KindCost = Entry->Cost[CostKind])
1424       return LT.first * *KindCost;
1425 
1426   // It is not a good idea to vectorize division. We have to scalarize it and
1427     // in the process we will often end up having to spill regular
1428   // registers. The overhead of division is going to dominate most kernels
1429   // anyways so try hard to prevent vectorization of division - it is
1430   // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431   // to hide "20 cycles" for each lane.
1432   if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433       (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434        ISD == ISD::UREM)) {
1435     InstructionCost ScalarCost =
1436         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437                                Op1Info.getNoProps(), Op2Info.getNoProps());
1438     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439   }
1440 
1441   // Handle some basic single instruction code size cases.
1442   if (CostKind == TTI::TCK_CodeSize) {
1443     switch (ISD) {
1444     case ISD::FADD:
1445     case ISD::FSUB:
1446     case ISD::FMUL:
1447     case ISD::FDIV:
1448     case ISD::FNEG:
1449     case ISD::AND:
1450     case ISD::OR:
1451     case ISD::XOR:
1452       return LT.first;
1453       break;
1454     }
1455   }
1456 
1457     // Fall back to the default implementation.
1458   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459                                        Args, CxtI);
1460 }
1461 
1462 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1463                                            VectorType *BaseTp,
1464                                            ArrayRef<int> Mask,
1465                                            TTI::TargetCostKind CostKind,
1466                                            int Index, VectorType *SubTp,
1467                                            ArrayRef<const Value *> Args) {
1468   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1469   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1470   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1471 
1472   Kind = improveShuffleKindFromMask(Kind, Mask);
1473 
1474   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1475   if (Kind == TTI::SK_Transpose)
1476     Kind = TTI::SK_PermuteTwoSrc;
1477 
1478   // For Broadcasts we are splatting the first element from the first input
1479   // register, so only need to reference that input and all the output
1480   // registers are the same.
1481   if (Kind == TTI::SK_Broadcast)
1482     LT.first = 1;
1483 
1484   // Subvector extractions are free if they start at the beginning of a
1485   // vector and cheap if the subvectors are aligned.
1486   if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1487     int NumElts = LT.second.getVectorNumElements();
1488     if ((Index % NumElts) == 0)
1489       return 0;
1490     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1491     if (SubLT.second.isVector()) {
1492       int NumSubElts = SubLT.second.getVectorNumElements();
1493       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1494         return SubLT.first;
1495       // Handle some cases for widening legalization. For now we only handle
1496       // cases where the original subvector was naturally aligned and evenly
1497       // fit in its legalized subvector type.
1498       // FIXME: Remove some of the alignment restrictions.
1499       // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1500       // vectors.
1501       int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1502       if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1503           (NumSubElts % OrigSubElts) == 0 &&
1504           LT.second.getVectorElementType() ==
1505               SubLT.second.getVectorElementType() &&
1506           LT.second.getVectorElementType().getSizeInBits() ==
1507               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1508         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1509                "Unexpected number of elements!");
1510         auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1511                                            LT.second.getVectorNumElements());
1512         auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1513                                            SubLT.second.getVectorNumElements());
1514         int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1515         InstructionCost ExtractCost =
1516             getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1517                            CostKind, ExtractIndex, SubTy);
1518 
1519         // If the original size is 32-bits or more, we can use pshufd. Otherwise
1520         // if we have SSSE3 we can use pshufb.
1521         if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1522           return ExtractCost + 1; // pshufd or pshufb
1523 
1524         assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1525                "Unexpected vector size");
1526 
1527         return ExtractCost + 2; // worst case pshufhw + pshufd
1528       }
1529     }
1530   }
1531 
1532   // Subvector insertions are cheap if the subvectors are aligned.
1533   // Note that in general, the insertion starting at the beginning of a vector
1534   // isn't free, because we need to preserve the rest of the wide vector.
1535   if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1536     int NumElts = LT.second.getVectorNumElements();
1537     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1538     if (SubLT.second.isVector()) {
1539       int NumSubElts = SubLT.second.getVectorNumElements();
1540       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1541         return SubLT.first;
1542     }
1543 
1544     // If the insertion isn't aligned, treat it like a 2-op shuffle.
1545     Kind = TTI::SK_PermuteTwoSrc;
1546   }
1547 
1548   // Handle some common (illegal) sub-vector types as they are often very cheap
1549   // to shuffle even on targets without PSHUFB.
1550   EVT VT = TLI->getValueType(DL, BaseTp);
1551   if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1552       !ST->hasSSSE3()) {
1553      static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1554       {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
1555       {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
1556       {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
1557       {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
1558       {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck
1559 
1560       {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
1561       {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
1562       {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
1563       {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck
1564 
1565       {TTI::SK_Splice,           MVT::v4i16, 2}, // punpck+psrldq
1566       {TTI::SK_Splice,           MVT::v2i16, 2}, // punpck+psrldq
1567       {TTI::SK_Splice,           MVT::v4i8,  2}, // punpck+psrldq
1568       {TTI::SK_Splice,           MVT::v2i8,  2}, // punpck+psrldq
1569 
1570       {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
1571       {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
1572       {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
1573       {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
1574       {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck
1575 
1576       {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1577       {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1578       {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
1579       {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
1580       {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
1581     };
1582 
1583     if (ST->hasSSE2())
1584       if (const auto *Entry =
1585               CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1586         return Entry->Cost;
1587   }
1588 
1589   // We are going to permute multiple sources and the result will be in multiple
1590   // destinations. Providing an accurate cost only for splits where the element
1591   // type remains the same.
1592   if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1593     MVT LegalVT = LT.second;
1594     if (LegalVT.isVector() &&
1595         LegalVT.getVectorElementType().getSizeInBits() ==
1596             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1597         LegalVT.getVectorNumElements() <
1598             cast<FixedVectorType>(BaseTp)->getNumElements()) {
1599 
1600       unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1601       unsigned LegalVTSize = LegalVT.getStoreSize();
1602       // Number of source vectors after legalization:
1603       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1604       // Number of destination vectors after legalization:
1605       InstructionCost NumOfDests = LT.first;
1606 
1607       auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1608                                               LegalVT.getVectorNumElements());
1609 
1610       if (!Mask.empty() && NumOfDests.isValid()) {
1611         // Try to perform better estimation of the permutation.
1612         // 1. Split the source/destination vectors into real registers.
1613         // 2. Do the mask analysis to identify which real registers are
1614         // permuted. If more than one source register is used to build the
1615         // destination register, the cost for this destination register
1616         // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1617         // source register is used, build mask and calculate the cost as a cost
1618         // of PermuteSingleSrc.
1619         // Also, for the single register permute we try to identify if the
1620         // destination register is just a copy of the source register or the
1621         // copy of the previous destination register (the cost is
1622         // TTI::TCC_Basic). If the source register is just reused, the cost for
1623         // this operation is 0.
1624         unsigned E = *NumOfDests.getValue();
1625         unsigned NormalizedVF =
1626             LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1627         unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1628         unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1629         SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1630         copy(Mask, NormalizedMask.begin());
1631         unsigned PrevSrcReg = 0;
1632         ArrayRef<int> PrevRegMask;
1633         InstructionCost Cost = 0;
1634         processShuffleMasks(
1635             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1636             [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1637              &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1638               if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
1639                 // Check if the previous register can be just copied to the next
1640                 // one.
1641                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1642                     PrevRegMask != RegMask)
1643                   Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1644                                          RegMask, CostKind, 0, nullptr);
1645                 else
1646                   // Just a copy of previous destination register.
1647                   Cost += TTI::TCC_Basic;
1648                 return;
1649               }
1650               if (SrcReg != DestReg &&
1651                   any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1652                 // Just a copy of the source register.
1653                 Cost += TTI::TCC_Basic;
1654               }
1655               PrevSrcReg = SrcReg;
1656               PrevRegMask = RegMask;
1657             },
1658             [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1659                                                 unsigned /*Unused*/,
1660                                                 unsigned /*Unused*/) {
1661               Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1662                                      CostKind, 0, nullptr);
1663             });
1664         return Cost;
1665       }
1666 
1667       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1668       return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1669                                             std::nullopt, CostKind, 0, nullptr);
1670     }
1671 
1672     return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1673   }
1674 
1675   // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1676   if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1677     // We assume that source and destination have the same vector type.
1678     InstructionCost NumOfDests = LT.first;
1679     InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1680     LT.first = NumOfDests * NumOfShufflesPerDest;
1681   }
1682 
1683   static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1684       {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1685       {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1686 
1687       {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1688       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1689 
1690       {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1691       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1692       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
1693   };
1694 
1695   if (ST->hasVBMI())
1696     if (const auto *Entry =
1697             CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1698       return LT.first * Entry->Cost;
1699 
1700   static const CostTblEntry AVX512BWShuffleTbl[] = {
1701       {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1702       {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1703       {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb
1704 
1705       {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1706       {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1707       {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1708       {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2
1709 
1710       {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1711       {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1712       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1713       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1714       {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
1715 
1716       {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1717       {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1718       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1719       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
1720       {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1721 
1722       {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1723       {TTI::SK_Select, MVT::v64i8,  1}, // vblendmb
1724 
1725       {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1726       {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1727       {TTI::SK_Splice, MVT::v64i8,  2}, // vshufi64x2 + palignr
1728   };
1729 
1730   if (ST->hasBWI())
1731     if (const auto *Entry =
1732             CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1733       return LT.first * Entry->Cost;
1734 
1735   static const CostKindTblEntry AVX512ShuffleTbl[] = {
1736       {TTI::SK_Broadcast, MVT::v8f64,  { 1, 1, 1, 1 } }, // vbroadcastsd
1737       {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1738       {TTI::SK_Broadcast, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpbroadcastq
1739       {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1740       {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1741       {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1742       {TTI::SK_Broadcast, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpbroadcastb
1743 
1744       {TTI::SK_Reverse, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1745       {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1746       {TTI::SK_Reverse, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1747       {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1748       {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1749       {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1750       {TTI::SK_Reverse, MVT::v64i8,  { 7, 7, 7, 7 } }, // per mca
1751 
1752       {TTI::SK_Splice, MVT::v8f64,  { 1, 1, 1, 1 } }, // vpalignd
1753       {TTI::SK_Splice, MVT::v4f64,  { 1, 1, 1, 1 } }, // vpalignd
1754       {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1755       {TTI::SK_Splice, MVT::v8f32,  { 1, 1, 1, 1 } }, // vpalignd
1756       {TTI::SK_Splice, MVT::v8i64,  { 1, 1, 1, 1 } }, // vpalignd
1757       {TTI::SK_Splice, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpalignd
1758       {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1759       {TTI::SK_Splice, MVT::v8i32,  { 1, 1, 1, 1 } }, // vpalignd
1760       {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1761       {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1762       {TTI::SK_Splice, MVT::v64i8,  { 4, 4, 4, 4 } }, // split + palignr
1763 
1764       {TTI::SK_PermuteSingleSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermpd
1765       {TTI::SK_PermuteSingleSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermpd
1766       {TTI::SK_PermuteSingleSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermpd
1767       {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1768       {TTI::SK_PermuteSingleSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermps
1769       {TTI::SK_PermuteSingleSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermps
1770       {TTI::SK_PermuteSingleSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermq
1771       {TTI::SK_PermuteSingleSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermq
1772       {TTI::SK_PermuteSingleSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermq
1773       {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1774       {TTI::SK_PermuteSingleSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermd
1775       {TTI::SK_PermuteSingleSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermd
1776       {TTI::SK_PermuteSingleSrc, MVT::v16i8,  { 1, 3, 1, 1 } }, // pshufb
1777 
1778       {TTI::SK_PermuteTwoSrc, MVT::v8f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1779       {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1780       {TTI::SK_PermuteTwoSrc, MVT::v8i64,  { 1, 3, 1, 1 } }, // vpermt2q
1781       {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1782       {TTI::SK_PermuteTwoSrc, MVT::v4f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1783       {TTI::SK_PermuteTwoSrc, MVT::v8f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1784       {TTI::SK_PermuteTwoSrc, MVT::v4i64,  { 1, 3, 1, 1 } }, // vpermt2q
1785       {TTI::SK_PermuteTwoSrc, MVT::v8i32,  { 1, 3, 1, 1 } }, // vpermt2d
1786       {TTI::SK_PermuteTwoSrc, MVT::v2f64,  { 1, 3, 1, 1 } }, // vpermt2pd
1787       {TTI::SK_PermuteTwoSrc, MVT::v4f32,  { 1, 3, 1, 1 } }, // vpermt2ps
1788       {TTI::SK_PermuteTwoSrc, MVT::v2i64,  { 1, 3, 1, 1 } }, // vpermt2q
1789       {TTI::SK_PermuteTwoSrc, MVT::v4i32,  { 1, 3, 1, 1 } }, // vpermt2d
1790 
1791       // FIXME: This just applies the type legalization cost rules above
1792       // assuming these completely split.
1793       {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1794       {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1795       {TTI::SK_PermuteSingleSrc, MVT::v64i8,  { 14, 14, 14, 14 } },
1796       {TTI::SK_PermuteTwoSrc,    MVT::v32i16, { 42, 42, 42, 42 } },
1797       {TTI::SK_PermuteTwoSrc,    MVT::v32f16, { 42, 42, 42, 42 } },
1798       {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  { 42, 42, 42, 42 } },
1799 
1800       {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1801       {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1802       {TTI::SK_Select, MVT::v64i8,  { 1, 1, 1, 1 } }, // vpternlogq
1803       {TTI::SK_Select, MVT::v8f64,  { 1, 1, 1, 1 } }, // vblendmpd
1804       {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1805       {TTI::SK_Select, MVT::v8i64,  { 1, 1, 1, 1 } }, // vblendmq
1806       {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1807   };
1808 
1809   if (ST->hasAVX512())
         // AVX-512 tier: AVX512ShuffleTbl is keyed on (Kind, LT.second). Its
         // entries carry one cost per cost kind, selected here with CostKind. The
         // hit is only used when that per-kind value tests true; it is then
         // dereferenced and scaled by LT.first. Otherwise control falls through
         // to the tables that follow.
1810     if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1811       if (auto KindCost = Entry->Cost[CostKind])
1812         return LT.first * *KindCost;
1813 
1814   static const CostTblEntry AVX2ShuffleTbl[] = {
           // Each entry is {shuffle kind, vector type, cost}. The trailing comment
           // on an entry names the instruction sequence the cost is modelled on.
           // The file header lists Haswell / Ryzen as the reference CPUs for AVX2.
1815       {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
1816       {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
1817       {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
1818       {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
1819       {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1820       {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1821       {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb
1822 
1823       {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
1824       {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
1825       {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
1826       {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
1827       {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1828       {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1829       {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb
1830 
1831       {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1832       {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1833       {TTI::SK_Select, MVT::v32i8,  1}, // vpblendvb
1834 
1835       {TTI::SK_Splice, MVT::v8i32,  2}, // vperm2i128 + vpalignr
1836       {TTI::SK_Splice, MVT::v8f32,  2}, // vperm2i128 + vpalignr
1837       {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1838       {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1839       {TTI::SK_Splice, MVT::v32i8,  2}, // vperm2i128 + vpalignr
1840 
1841       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
1842       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
1843       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
1844       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
1845       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1846                                                   // + vpblendvb
1847       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1848                                                   // + vpblendvb
1849       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
1850                                                   // + vpblendvb
1851 
1852       {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
1853       {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
1854       {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
1855       {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
1856       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1857                                                // + vpblendvb
1858       {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1859                                                // + vpblendvb
1860       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
1861                                                // + vpblendvb
1862   };
1863 
       // Consulted only on AVX2 subtargets. The key is (Kind, LT.second); a hit
       // is scaled by LT.first. A miss falls through to the next table.
1864   if (ST->hasAVX2())
1865     if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1866       return LT.first * Entry->Cost;
1867 
1868   static const CostTblEntry XOPShuffleTbl[] = {
           // Each entry is {shuffle kind, vector type, cost}. Only the two general
           // permute kinds are listed here, so every other shuffle kind misses
           // this table and falls through to the ones after it.
1869       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
1870       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
1871       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
1872       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
1873       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1874                                                   // + vinsertf128
1875       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
1876                                                   // + vinsertf128
1877 
1878       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1879                                                // + vinsertf128
1880       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
1881       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
1882                                                // + vinsertf128
1883       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
1884   };
1885 
       // Consulted only when the subtarget reports XOP. The key is
       // (Kind, LT.second); a hit is scaled by LT.first.
1886   if (ST->hasXOP())
1887     if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1888       return LT.first * Entry->Cost;
1889 
1890   static const CostTblEntry AVX1ShuffleTbl[] = {
           // Each entry is {shuffle kind, vector type, cost}; the trailing comment
           // names the instruction sequence behind the cost. The file header lists
           // Sandy Bridge / Jaguar / Bulldozer as the reference CPUs for AVX.
1891       {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1892       {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1893       {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1894       {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1895       {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1896       {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1897       {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128
1898 
1899       {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1900       {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1901       {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1902       {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1903       {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1904                                          // + vinsertf128
1905       {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1906                                          // + vinsertf128
1907       {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
1908                                          // + vinsertf128
1909 
1910       {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
1911       {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
1912       {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
1913       {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
1914       {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1915       {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1916       {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor
1917 
1918       {TTI::SK_Splice, MVT::v4i64,  2}, // vperm2f128 + shufpd
1919       {TTI::SK_Splice, MVT::v4f64,  2}, // vperm2f128 + shufpd
1920       {TTI::SK_Splice, MVT::v8i32,  4}, // 2*vperm2f128 + 2*vshufps
1921       {TTI::SK_Splice, MVT::v8f32,  4}, // 2*vperm2f128 + 2*vshufps
1922       {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1923       {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1924       {TTI::SK_Splice, MVT::v32i8,  5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1925 
1926       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
1927       {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
1928       {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
1929       {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
1930       {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1931                                                   // + 2*por + vinsertf128
1932       {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1933                                                   // + 2*por + vinsertf128
1934       {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
1935                                                   // + 2*por + vinsertf128
1936 
1937       {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
1938       {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
1939       {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
1940       {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
1941       {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1942                                                 // + 4*por + vinsertf128
1943       {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1944                                                 // + 4*por + vinsertf128
1945       {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
1946                                                 // + 4*por + vinsertf128
1947   };
1948 
       // Consulted only on AVX subtargets. The key is (Kind, LT.second); a hit
       // is scaled by LT.first. A miss falls through to the SSE tables.
1949   if (ST->hasAVX())
1950     if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1951       return LT.first * Entry->Cost;
1952 
1953   static const CostTblEntry SSE41ShuffleTbl[] = {
           // Each entry is {shuffle kind, vector type, cost}. This table only
           // covers SK_Select: every listed type is costed as a single
           // blend-style instruction. The file header lists Penryn as the
           // reference CPU for SSE 4.1.
1954       {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1955       {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1956       {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1957       {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1958       {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1959       {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1960       {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
1961   };
1962 
       // Consulted only on SSE4.1 subtargets. The key is (Kind, LT.second); a
       // hit is scaled by LT.first.
1963   if (ST->hasSSE41())
1964     if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1965       return LT.first * Entry->Cost;
1966 
1967   static const CostTblEntry SSSE3ShuffleTbl[] = {
           // Each entry is {shuffle kind, vector type, cost}. Apart from the
           // 32-bit-element splice entries, only the 16-bit and 8-bit element
           // types are listed here; the sequences are built on pshufb / palignr.
1968       {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1969       {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1970       {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1971 
1972       {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1973       {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1974       {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1975 
1976       {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1977       {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1978       {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1979 
1980       {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
1981       {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
1982       {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
1983       {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
1984       {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
1985 
1986       {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1987       {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1988       {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1989 
1990       {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1991       {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1992       {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1993   };
1994 
       // Consulted only on SSSE3 subtargets. The key is (Kind, LT.second); a
       // hit is scaled by LT.first.
1995   if (ST->hasSSSE3())
1996     if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1997       return LT.first * Entry->Cost;
1998 
1999   static const CostTblEntry SSE2ShuffleTbl[] = {
           // Each entry is {shuffle kind, vector type, cost}; the trailing comment
           // names the instruction sequence behind the cost. There are no v4f32
           // entries in this table; that type is covered by SSE1ShuffleTbl.
2000       {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2001       {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2002       {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2003       {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2004       {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2005       {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2006 
2007       {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2008       {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2009       {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2010       {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2011       {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2012       {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2013                                         // + 2*pshufd + 2*unpck + packus
2014 
2015       {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2016       {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2017       {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2018       {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2019       {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2020       {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2021 
2022       {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2023       {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2024       {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2025       {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2026       {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2027       {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2028 
2029       {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2030       {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2031       {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2032       {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2033                                                   // + pshufd/unpck
2034       {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2035                                                   // + pshufd/unpck
2036     { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2037                                                   // + 2*pshufd + 2*unpck + 2*packus
2038 
2039     { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
2040     { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
2041     { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
2042     { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
2043     { TTI::SK_PermuteTwoSrc,    MVT::v8f16,  8 }, // blend+permute
2044     { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
2045   };
2046 
2047   static const CostTblEntry SSE3BroadcastLoadTbl[] = {
           // Single special case: a v2f64 broadcast is costed at 0. The lookup
           // below only uses this table when SSE3 is available and at least one
           // value in Args is a LoadInst.
2048       {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2049   };
2050 
2051   if (ST->hasSSE2()) {
         // True when any value in Args is a load instruction.
2052     bool IsLoad =
2053         llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
         // The broadcast-from-load special case is tried first, ahead of the
         // general SSE2 table.
2054     if (ST->hasSSE3() && IsLoad)
2055       if (const auto *Entry =
2056               CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
           // Keeps the table in sync with isLegalBroadcastLoad(): every type
           // listed in the table must also be accepted there.
2057         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2058                                     LT.second.getVectorElementCount()) &&
2059                "Table entry missing from isLegalBroadcastLoad()");
2060         return LT.first * Entry->Cost;
2061       }
2062 
         // General SSE2 costs, keyed on (Kind, LT.second) and scaled by LT.first.
2063     if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2064       return LT.first * Entry->Cost;
2065   }
2066 
2067   static const CostTblEntry SSE1ShuffleTbl[] = {
         // Each entry is {shuffle kind, vector type, cost}. Only v4f32 is listed;
         // every kind is costed as one or two shufps.
2068     { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
2069     { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
2070     { TTI::SK_Select,           MVT::v4f32, 2 }, // 2*shufps
2071     { TTI::SK_Splice,           MVT::v4f32, 2 }, // 2*shufps
2072     { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2073     { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
2074   };
2075 
       // Consulted only on SSE1 subtargets. The key is (Kind, LT.second); a hit
       // is scaled by LT.first.
2076   if (ST->hasSSE1())
2077     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2078       return LT.first * Entry->Cost;
2079 
       // No table produced a cost: defer to BaseT::getShuffleCost.
2080   return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2081 }
2082 
2083 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2084                                              Type *Src,
2085                                              TTI::CastContextHint CCH,
2086                                              TTI::TargetCostKind CostKind,
2087                                              const Instruction *I) {
2088   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2089   assert(ISD && "Invalid opcode");
2090 
2091   // TODO: Allow non-throughput costs that aren't binary.
2092   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
         // For every cost kind other than reciprocal throughput the cost is
         // collapsed to a binary value: 0 stays 0, anything else becomes 1.
2093     if (CostKind != TTI::TCK_RecipThroughput)
2094       return Cost == 0 ? 0 : 1;
         // Reciprocal-throughput costs are passed through unchanged.
2095     return Cost;
2096   };
2097 
2098   // The cost tables include both specific, custom (non-legal) src/dst type
2099   // conversions and generic, legalized types. We test for customs first, before
2100   // falling back to legalization.
2101   // FIXME: The cost tables need a better design to handle non-simple types,
2102   // whose combinations (elem_num x src_type x dst_type) can be massive.
2103   static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
         // Each entry is {ISD opcode, destination type, source type, cost}:
         // extends list the wider type first, truncates the narrower one.
         // NOTE(review): the lookup that consumes this table is not adjacent to
         // it; presumably it is gated on AVX512BW as the name suggests - confirm
         // at the use site.
2104     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2105     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2106 
2107     // Mask sign extend has an instruction.
2108     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
2109     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
2110     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
2111     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
2112     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
2113     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
2114     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
2115     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
2116     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
2117     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
2118     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
2119     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
2120     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2121     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
2122     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1,  1 },
2123     { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1,  1 },
2124     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1,  1 },
2125 
2126     // Mask zero extend is a sext + shift.
2127     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
2128     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
2129     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
2130     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
2131     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
2132     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
2133     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
2134     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
2135     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
2136     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
2137     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
2138     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
2139     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
2140     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
2141     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1,  2 },
2142     { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1,  2 },
2143     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1,  2 },
2144 
2145     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
2146     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
2147     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
2148     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
2149     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
2150     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
2151     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
2152     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
2153     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
2154     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
2155     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
2156     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
2157     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
2158     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
2159     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, 2 },
2160     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  2 },
2161     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i16, 2 },
2162 
2163     { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, 2 },
2164     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // widen to zmm
2165     { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  2 }, // vpmovwb
2166     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 }, // vpmovwb
2167     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 }, // vpmovwb
2168   };
2169 
2170   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
         // Each entry is {ISD opcode, destination type, source type, cost}.
         // Covers mask extends/truncates for 32/64-bit elements and the
         // conversions between 64-bit integer vectors and floating point.
         // NOTE(review): the lookup that consumes this table is not adjacent to
         // it; presumably it is gated on AVX512DQ as the name suggests - confirm
         // at the use site.
2171     // Mask sign extend has an instruction.
2172     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
2173     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
2174     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
2175     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
2176     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
2177     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i1,  1 },
2178     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 },
2179     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 },
2180 
2181     // Mask zero extend is a sext + shift.
2182     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
2183     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
2184     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
2185     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
2186     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
2187     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i1,  2 },
2188     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 },
2189     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
2190 
2191     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
2192     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
2193     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
2194     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
2195     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2196     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  2 },
2197     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i32, 2 },
2198     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i64,  2 },
2199 
2200     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
2201     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
2202 
2203     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
2204     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
2205 
2206     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
2207     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
2208 
2209     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
2210     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
2211   };
2212 
2213   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2214   // 256-bit wide vectors.
2215 
2216   static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
         // Each entry is {ISD opcode, destination type, source type, cost}. A
         // trailing comment, where present, names the instruction sequence the
         // cost is modelled on; "zmm" marks sequences that run at 512-bit width.
         // NOTE(review): the lookup that consumes this table is not adjacent to
         // it; presumably it is gated on AVX512F as the name suggests - confirm
         // at the use site.
2217     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
2218     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
2219     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
2220 
2221     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
2222     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
2223     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
2224     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  3 }, // sext+vpslld+vptestmd
2225     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
2226     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
2227     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
2228     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2229     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // zmm vpslld+vptestmd
2230     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // zmm vpslld+vptestmd
2231     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // zmm vpslld+vptestmd
2232     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32, 2 }, // vpslld+vptestmd
2233     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // zmm vpsllq+vptestmq
2234     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // zmm vpsllq+vptestmq
2235     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,  2 }, // vpsllq+vptestmq
2236     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i32,  2 }, // vpmovdb
2237     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i32,  2 }, // vpmovdb
2238     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 2 }, // vpmovdb
2239     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v16i32, 2 }, // vpmovdb
2240     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v16i32, 2 }, // vpmovdb
2241     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 2 }, // vpmovdw
2242     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v16i32, 2 }, // vpmovdw
2243     { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i64,  2 }, // vpmovqb
2244     { ISD::TRUNCATE,  MVT::v2i16,   MVT::v2i64,  1 }, // vpshufb
2245     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,  2 }, // vpmovqb
2246     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v8i64,  2 }, // vpmovqb
2247     { ISD::TRUNCATE,  MVT::v32i8,   MVT::v8i64,  2 }, // vpmovqb
2248     { ISD::TRUNCATE,  MVT::v64i8,   MVT::v8i64,  2 }, // vpmovqb
2249     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  2 }, // vpmovqw
2250     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v8i64,  2 }, // vpmovqw
2251     { ISD::TRUNCATE,  MVT::v32i16,  MVT::v8i64,  2 }, // vpmovqw
2252     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 }, // vpmovqd
2253     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // zmm vpmovqd
2254     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2255 
2256     { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,  3 }, // extend to v16i32
2257     { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,  8 },
2258     { ISD::TRUNCATE,  MVT::v64i8,  MVT::v32i16,  8 },
2259 
2260     // Sign extend is zmm vpternlogd+vpmovdb.
2261     // Zero extend is zmm broadcast load+vpmovdw.
2262     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   3 },
2263     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   4 },
2264     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   3 },
2265     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   4 },
2266     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   3 },
2267     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   4 },
2268     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  3 },
2269     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  4 },
2270 
2271     // Sign extend is zmm vpternlogd+vpmovdw.
2272     // Zero extend is zmm vpternlogd+vpmovdw+vpsrlw.
2273     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   3 },
2274     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
2275     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   3 },
2276     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
2277     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   3 },
2278     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
2279     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  3 },
2280     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2281 
2282     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // zmm vpternlogd
2283     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // zmm vpternlogd+psrld
2284     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // zmm vpternlogd
2285     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // zmm vpternlogd+psrld
2286     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // zmm vpternlogd
2287     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // zmm vpternlogd+psrld
2288     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // zmm vpternlogq
2289     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // zmm vpternlogq+psrlq
2290     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // zmm vpternlogq
2291     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // zmm vpternlogq+psrlq
2292 
2293     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 }, // vpternlogd
2294     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 }, // vpternlogd+psrld
2295     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 }, // vpternlogq
2296     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 }, // vpternlogq+psrlq
2297 
2298     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
2299     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
2300     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2301     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2302     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
2303     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
2304     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
2305     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
2306     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
2307     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
2308 
2309     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
2310     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
2311 
2312     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
2313     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
2314     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
2315     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
2316     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
2317     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
2318     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
2319     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
2320 
2321     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
2322     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
2323     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
2324     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
2325     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
2326     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
2327     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
2328     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
2329     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
2330     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
2331 
2332     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
2333     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f64, 7 },
2334     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f64,15 },
2335     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f32,11 },
2336     { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f64,31 },
2337     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  3 },
2338     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f64, 7 },
2339     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f32, 5 },
2340     { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f64,15 },
2341     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  1 },
2342     { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, 3 },
2343 
2344     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
2345     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  3 },
2346     { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  3 },
2347     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
2348     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 3 },
2349     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 3 },
2350   };
2351 
2352   static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasBWI() holds; unlike AVX512BWConversionTbl, that lookup is not
         // gated on ST->useAVX512Regs().
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2353     // Mask sign extend has an instruction.
2354     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
2355     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
2356     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
2357     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
2358     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
2359     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
2360     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
2361     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
2362     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
2363     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
2364     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
2365     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
2366     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2367     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
2368     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1,  1 },
2369     { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v64i1,  1 },
2370     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1,  1 },
2371 
2372     // Mask zero extend is a sext + shift.
2373     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
2374     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
2375     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
2376     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
2377     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
2378     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
2379     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
2380     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
2381     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
2382     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
2383     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
2384     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
2385     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
2386     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
2387     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1,  2 },
2388     { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v64i1,  2 },
2389     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1,  2 },
2390 
2391     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
2392     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
2393     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
2394     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
2395     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
2396     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
2397     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
2398     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
2399     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
2400     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
2401     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
2402     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
2403     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
2404     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
2405     { ISD::TRUNCATE,    MVT::v32i1,  MVT::v16i16, 2 },
2406     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i8,  2 },
2407     { ISD::TRUNCATE,    MVT::v64i1,  MVT::v16i16, 2 },
2408 
2409     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 },
2410   };
2411 
2412   static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasDQI() holds; that lookup is not gated on ST->useAVX512Regs()
         // and runs after the AVX512BWVLConversionTbl lookup.
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2413     // Mask sign extend has an instruction.
2414     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
2415     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
2416     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
2417     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  1 },
2418     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
2419     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   1 },
2420     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 },
2421     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
2422 
2423     // Mask zero extend is a sext + shift.
2424     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
2425     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
2426     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
2427     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  2 },
2428     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
2429     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   2 },
2430     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 },
2431     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
2432 
2433     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  2 },
2434     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  2 },
2435     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
2436     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
2437     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
2438     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
2439     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  2 },
2440     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2441 
2442     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
2443     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
2444     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
2445     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
2446 
2447     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
2448     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
2449     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
2450     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
2451 
2452     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  1 },
2453     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
2454     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
2455     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
2456 
2457     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  1 },
2458     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
2459     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
2460     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
2461   };
2462 
2463   static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasAVX512() holds. That lookup runs after the BWVL and DQVL
         // lookups, so a matching entry in those tables is returned first.
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2464     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
2465     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
2466     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
2467     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  8 }, // split+2*v8i8
2468     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
2469     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
2470     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
2471     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 8 }, // split+2*v8i16
2472     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // vpslld+vptestmd
2473     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // vpslld+vptestmd
2474     { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // vpslld+vptestmd
2475     { ISD::TRUNCATE,  MVT::v16i1,   MVT::v8i32,  2 }, // vpslld+vptestmd
2476     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // vpsllq+vptestmq
2477     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // vpsllq+vptestmq
2478     { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // vpmovqd
2479     { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i64,  2 }, // vpmovqb
2480     { ISD::TRUNCATE,  MVT::v4i16,   MVT::v4i64,  2 }, // vpmovqw
2481     { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i32,  2 }, // vpmovdb
2482 
2483     // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2484     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2485     { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   5 },
2486     { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   6 },
2487     { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   5 },
2488     { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   6 },
2489     { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   5 },
2490     { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   6 },
2491     { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 10 },
2492     { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 12 },
2493 
2494     // sign extend is vpcmpeq+maskedmove+vpmovdw
2495     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2496     { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
2497     { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   5 },
2498     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
2499     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   5 },
2500     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
2501     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   5 },
2502     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2503     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2504 
2505     { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // vpternlogd
2506     { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // vpternlogd+psrld
2507     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // vpternlogd
2508     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // vpternlogd+psrld
2509     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // vpternlogd
2510     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // vpternlogd+psrld
2511     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 }, // vpternlogd
2512     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 }, // vpternlogd+psrld
2513 
2514     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // vpternlogq
2515     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // vpternlogq+psrlq
2516     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // vpternlogq
2517     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // vpternlogq+psrlq
2518 
2519     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
2520     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
2521     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
2522     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
2523     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
2524     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
2525     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
2526     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
2527     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
2528     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
2529     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
2530     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
2531 
2532     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2533     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
2534     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2535     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
2536 
2537     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
2538     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
2539     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2540     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
2541     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2542     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
2543     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
2544     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
2545     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
2546     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
2547     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
2548     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
2549     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
2550 
2551     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
2552     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
2553     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, 5 },
2554 
2555     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    1 },
2556     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    1 },
2557     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
2558     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  1 },
2559     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
2560     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
2561     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
2562   };
2563 
2564   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasAVX2() holds, after all AVX-512 table lookups have failed to
         // match.
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2565     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
2566     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
2567     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
2568     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
2569     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2570     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
2571 
2572     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
2573     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
2574     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
2575     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
2576     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
2577     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
2578     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
2579     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
2580     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
2581     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
2582     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2583     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2584     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
2585     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
2586 
2587     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
2588 
2589     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 4 },
2590     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 4 },
2591     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  1 },
2592     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  1 },
2593     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  1 },
2594     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  4 },
2595     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  4 },
2596     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  1 },
2597     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  1 },
2598     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  5 },
2599     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  1 },
2600     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
2601 
2602     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
2603     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
2604 
2605     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  1 },
2606     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  1 },
2607     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  1 },
2608     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  3 },
2609 
2610     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    3 },
2611     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    3 },
2612     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  1 },
2613     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
2614     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2615     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4 },
2616     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  3 },
2617     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  4 },
2618 
2619     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
2620     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
2621     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
2622     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
2623     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
2624     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
2625     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  3 },
2626 
2627     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
2628     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
2629     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
2630     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
2631     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
2632     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
2633     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  2 },
2634     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2635     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
2636     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
2637   };
2638 
2639   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasAVX() holds, after the AVX2ConversionTbl lookup has failed to
         // match.
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2640     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   6 },
2641     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
2642     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   7 },
2643     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
2644     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2645     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
2646 
2647     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
2648     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
2649     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
2650     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
2651     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
2652     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
2653     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
2654     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
2655     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
2656     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
2657     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
2658     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
2659 
2660     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  4 },
2661     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  5 },
2662     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 4 },
2663     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  9 },
2664     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, 11 },
2665 
2666     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
2667     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 6 },
2668     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // and+extract+packuswb
2669     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  5 },
2670     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
2671     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  5 },
2672     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  3 }, // and+extract+2*packusdw
2673     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
2674 
2675     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   3 },
2676     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   3 },
2677     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   8 },
2678     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
2679     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
2680     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
2681     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
2682     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2683     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
2684     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
2685     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  5 },
2686     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  8 },
2687 
2688     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   7 },
2689     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   7 },
2690     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   6 },
2691     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
2692     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
2693     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
2694     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
2695     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  4 },
2696     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  4 },
2697     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
2698     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  6 },
2699     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
2700     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32, 10 },
2701     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64, 10 },
2702     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 18 },
2703     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
2704     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 10 },
2705 
2706     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
2707     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  2 },
2708     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  2 },
2709     { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  2 },
2710     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  2 },
2711     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  2 },
2712     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  2 },
2713     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  2 },
2714     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  2 },
2715     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  2 },
2716     { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  5 },
2717 
2718     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  2 },
2719     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  2 },
2720     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  2 },
2721     { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  2 },
2722     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  2 },
2723     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  2 },
2724     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  2 },
2725     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  2 },
2726     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
2727     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2728     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  6 },
2729     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  7 },
2730     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  7 },
2731 
2732     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
2733     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
2734   };
2735 
2736   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasSSE41() holds, after the AVXConversionTbl lookup has failed to
         // match.
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2737     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
2738     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
2739     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
2740     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
2741     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
2742     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
2743     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
2744     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
2745     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
2746     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
2747     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
2748     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
2749 
2750     // These truncates end up widening elements.
2751     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   1 }, // PMOVZXBQ
2752     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  1 }, // PMOVZXWQ
2753     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   1 }, // PMOVZXBD
2754 
2755     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  2 },
2756     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  2 },
2757     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  2 },
2758 
2759     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
2760     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
2761     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
2762     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
2763     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
2764     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2765     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
2766     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2767     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
2768     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  1 },
2769     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
2770 
2771     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
2772     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
2773     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
2774     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
2775     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
2776     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
2777     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
2778     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
2779     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  3 },
2780     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
2781     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  2 },
2782     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 12 },
2783     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 22 },
2784     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  4 },
2785 
2786     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    1 },
2787     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    1 },
2788     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    1 },
2789     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    1 },
2790     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  2 },
2791     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  2 },
2792     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  1 },
2793     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  1 },
2794     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  1 },
2795     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  1 },
2796 
2797     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    1 },
2798     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
2799     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    1 },
2800     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
2801     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  2 },
2802     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  2 },
2803     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  1 },
2804     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  1 },
2805     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
2806     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
2807   };
2808 
2809   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
         // Cast costs consulted by the direct (simple MVT) lookup when
         // ST->hasSSE2() holds. It is the last table tried there; if nothing
         // matches, the code falls back to the legalized types.
         // Each entry is { ISD opcode, destination MVT, source MVT, cost }.
2810     // These are somewhat magic numbers justified by comparing the
2811     // output of llvm-mca for our various supported scheduler models
2812     // and basing it off the worst case scenario.
2813     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
2814     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
2815     { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    3 },
2816     { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    3 },
2817     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  3 },
2818     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
2819     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  3 },
2820     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
2821     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
2822     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  4 },
2823     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  8 },
2824     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  8 },
2825 
2826     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
2827     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
2828     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    8 },
2829     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    9 },
2830     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
2831     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  4 },
2832     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  4 },
2833     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
2834     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  7 },
2835     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  7 },
2836     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
2837     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64, 15 },
2838     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 18 },
2839 
2840     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    4 },
2841     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    4 },
2842     { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    4 },
2843     { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    4 },
2844     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  6 },
2845     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  6 },
2846     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  5 },
2847     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  5 },
2848     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  4 },
2849     { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  4 },
2850 
2851     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    4 },
2852     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
2853     { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    4 },
2854     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,   15 },
2855     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  6 },
2856     { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  6 },
2857     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  5 },
2858     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  5 },
2859     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  8 },
2860     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  8 },
2861 
2862     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
2863     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
2864     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  2 },
2865     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  3 },
2866     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  1 },
2867     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  2 },
2868     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  2 },
2869     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  3 },
2870     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  1 },
2871     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  2 },
2872     { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  1 },
2873     { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  2 },
2874 
2875     // These truncates are really widening elements.
2876     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  1 }, // PSHUFD
2877     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // PUNPCKLWD+DQ
2878     { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   3 }, // PUNPCKLBW+WD+PSHUFD
2879     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  1 }, // PUNPCKLWD
2880     { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // PUNPCKLBW+WD
2881     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   1 }, // PUNPCKLBW
2882 
2883     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  2 }, // PAND+PACKUSWB
2884     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
2885     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  3 }, // PAND+2*PACKUSWB
2886     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
2887     { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  1 },
2888     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  3 },
2889     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
2890     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32,10 },
2891     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  4 }, // PAND+3*PACKUSWB
2892     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  2 }, // PSHUFD+PSHUFLW
2893     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  1 }, // PSHUFD
2894   };
2895 
2896   // Attempt to map directly to (simple) MVT types to let us match custom entries.
2897   EVT SrcTy = TLI->getValueType(DL, Src);
2898   EVT DstTy = TLI->getValueType(DL, Dst);
2899 
2900   // The function getSimpleVT only handles simple value types.
2901   if (SrcTy.isSimple() && DstTy.isSimple()) {
2902     MVT SimpleSrcTy = SrcTy.getSimpleVT();
2903     MVT SimpleDstTy = DstTy.getSimpleVT();
2904 
2905     if (ST->useAVX512Regs()) {
2906       if (ST->hasBWI())
2907         if (const auto *Entry = ConvertCostTableLookup(
2908                 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2909           return AdjustCost(Entry->Cost);
2910 
2911       if (ST->hasDQI())
2912         if (const auto *Entry = ConvertCostTableLookup(
2913                 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2914           return AdjustCost(Entry->Cost);
2915 
2916       if (ST->hasAVX512())
2917         if (const auto *Entry = ConvertCostTableLookup(
2918                 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2919           return AdjustCost(Entry->Cost);
2920     }
2921 
2922     if (ST->hasBWI())
2923       if (const auto *Entry = ConvertCostTableLookup(
2924               AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2925         return AdjustCost(Entry->Cost);
2926 
2927     if (ST->hasDQI())
2928       if (const auto *Entry = ConvertCostTableLookup(
2929               AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2930         return AdjustCost(Entry->Cost);
2931 
2932     if (ST->hasAVX512())
2933       if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2934                                                      SimpleDstTy, SimpleSrcTy))
2935         return AdjustCost(Entry->Cost);
2936 
2937     if (ST->hasAVX2()) {
2938       if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2939                                                      SimpleDstTy, SimpleSrcTy))
2940         return AdjustCost(Entry->Cost);
2941     }
2942 
2943     if (ST->hasAVX()) {
2944       if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2945                                                      SimpleDstTy, SimpleSrcTy))
2946         return AdjustCost(Entry->Cost);
2947     }
2948 
2949     if (ST->hasSSE41()) {
2950       if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2951                                                      SimpleDstTy, SimpleSrcTy))
2952         return AdjustCost(Entry->Cost);
2953     }
2954 
2955     if (ST->hasSSE2()) {
2956       if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2957                                                      SimpleDstTy, SimpleSrcTy))
2958         return AdjustCost(Entry->Cost);
2959     }
2960   }
2961 
2962   // Fall back to legalized types.
2963   std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2964   std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2965 
2966   // If we're truncating to the same legalized type - just assume its free.
2967   if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2968     return TTI::TCC_Free;
2969 
2970   if (ST->useAVX512Regs()) {
2971     if (ST->hasBWI())
2972       if (const auto *Entry = ConvertCostTableLookup(
2973               AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2974         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2975 
2976     if (ST->hasDQI())
2977       if (const auto *Entry = ConvertCostTableLookup(
2978               AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2979         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2980 
2981     if (ST->hasAVX512())
2982       if (const auto *Entry = ConvertCostTableLookup(
2983               AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2984         return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2985   }
2986 
2987   if (ST->hasBWI())
2988     if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2989                                                    LTDest.second, LTSrc.second))
2990       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2991 
2992   if (ST->hasDQI())
2993     if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2994                                                    LTDest.second, LTSrc.second))
2995       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2996 
2997   if (ST->hasAVX512())
2998     if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2999                                                    LTDest.second, LTSrc.second))
3000       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3001 
3002   if (ST->hasAVX2())
3003     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3004                                                    LTDest.second, LTSrc.second))
3005       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3006 
3007   if (ST->hasAVX())
3008     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3009                                                    LTDest.second, LTSrc.second))
3010       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3011 
3012   if (ST->hasSSE41())
3013     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3014                                                    LTDest.second, LTSrc.second))
3015       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3016 
3017   if (ST->hasSSE2())
3018     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3019                                                    LTDest.second, LTSrc.second))
3020       return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3021 
3022   // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3023   // sitofp.
3024   if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3025       1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3026     Type *ExtSrc = Src->getWithNewBitWidth(32);
3027     unsigned ExtOpc =
3028         (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3029 
3030     // For scalar loads the extend would be free.
3031     InstructionCost ExtCost = 0;
3032     if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3033       ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3034 
3035     return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3036                                       TTI::CastContextHint::None, CostKind);
3037   }
3038 
3039   // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3040   // i32.
3041   if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3042       1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3043     Type *TruncDst = Dst->getWithNewBitWidth(32);
3044     return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3045            getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3046                             TTI::CastContextHint::None, CostKind);
3047   }
3048 
3049   return AdjustCost(
3050       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3051 }
3052 
3053 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3054                                                Type *CondTy,
3055                                                CmpInst::Predicate VecPred,
3056                                                TTI::TargetCostKind CostKind,
3057                                                const Instruction *I) {
3058   // Early out if this type isn't scalar/vector integer/float.
3059   if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3060     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3061                                      I);
3062 
3063   // Legalize the type.
3064   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3065 
3066   MVT MTy = LT.second;
3067 
3068   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3069   assert(ISD && "Invalid opcode");
3070 
3071   InstructionCost ExtraCost = 0;
3072   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3073     // Some vector comparison predicates cost extra instructions.
3074     // TODO: Should we invert this and assume worst case cmp costs
3075     // and reduce for particular predicates?
3076     if (MTy.isVector() &&
3077         !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3078           (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3079           ST->hasBWI())) {
3080       // Fallback to I if a specific predicate wasn't specified.
3081       CmpInst::Predicate Pred = VecPred;
3082       if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3083                 Pred == CmpInst::BAD_FCMP_PREDICATE))
3084         Pred = cast<CmpInst>(I)->getPredicate();
3085 
3086       switch (Pred) {
3087       case CmpInst::Predicate::ICMP_NE:
3088         // xor(cmpeq(x,y),-1)
3089         ExtraCost = 1;
3090         break;
3091       case CmpInst::Predicate::ICMP_SGE:
3092       case CmpInst::Predicate::ICMP_SLE:
3093         // xor(cmpgt(x,y),-1)
3094         ExtraCost = 1;
3095         break;
3096       case CmpInst::Predicate::ICMP_ULT:
3097       case CmpInst::Predicate::ICMP_UGT:
3098         // cmpgt(xor(x,signbit),xor(y,signbit))
3099         // xor(cmpeq(pmaxu(x,y),x),-1)
3100         ExtraCost = 2;
3101         break;
3102       case CmpInst::Predicate::ICMP_ULE:
3103       case CmpInst::Predicate::ICMP_UGE:
3104         if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3105             (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3106           // cmpeq(psubus(x,y),0)
3107           // cmpeq(pminu(x,y),x)
3108           ExtraCost = 1;
3109         } else {
3110           // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3111           ExtraCost = 3;
3112         }
3113         break;
3114       case CmpInst::Predicate::FCMP_ONE:
3115       case CmpInst::Predicate::FCMP_UEQ:
3116         // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3117         // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3118         if (CondTy && !ST->hasAVX())
3119           return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3120                                     CmpInst::Predicate::FCMP_UNO, CostKind) +
3121                  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3122                                     CmpInst::Predicate::FCMP_OEQ, CostKind) +
3123                  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3124 
3125         break;
3126       case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3127       case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3128         // Assume worst case scenario and add the maximum extra cost.
3129         ExtraCost = 3;
3130         break;
3131       default:
3132         break;
3133       }
3134     }
3135   }
3136 
3137   static const CostKindTblEntry SLMCostTbl[] = {
3138     // slm pcmpeq/pcmpgt throughput is 2
3139     { ISD::SETCC,   MVT::v2i64,   { 2, 5, 1, 2 } },
3140     // slm pblendvb/blendvpd/blendvps throughput is 4
3141     { ISD::SELECT,  MVT::v2f64,   { 4, 4, 1, 3 } }, // vblendvpd
3142     { ISD::SELECT,  MVT::v4f32,   { 4, 4, 1, 3 } }, // vblendvps
3143     { ISD::SELECT,  MVT::v2i64,   { 4, 4, 1, 3 } }, // pblendvb
3144     { ISD::SELECT,  MVT::v8i32,   { 4, 4, 1, 3 } }, // pblendvb
3145     { ISD::SELECT,  MVT::v8i16,   { 4, 4, 1, 3 } }, // pblendvb
3146     { ISD::SELECT,  MVT::v16i8,   { 4, 4, 1, 3 } }, // pblendvb
3147   };
3148 
3149   static const CostKindTblEntry AVX512BWCostTbl[] = {
3150     { ISD::SETCC,   MVT::v32i16,  { 1, 1, 1, 1 } },
3151     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 1 } },
3152     { ISD::SETCC,   MVT::v64i8,   { 1, 1, 1, 1 } },
3153     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 1 } },
3154 
3155     { ISD::SELECT,  MVT::v32i16,  { 1, 1, 1, 1 } },
3156     { ISD::SELECT,  MVT::v64i8,   { 1, 1, 1, 1 } },
3157   };
3158 
3159   static const CostKindTblEntry AVX512CostTbl[] = {
3160     { ISD::SETCC,   MVT::v8f64,   { 1, 4, 1, 1 } },
3161     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 1 } },
3162     { ISD::SETCC,   MVT::v16f32,  { 1, 4, 1, 1 } },
3163     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 1 } },
3164 
3165     { ISD::SETCC,   MVT::v8i64,   { 1, 1, 1, 1 } },
3166     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 1 } },
3167     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3168     { ISD::SETCC,   MVT::v16i32,  { 1, 1, 1, 1 } },
3169     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 1 } },
3170     { ISD::SETCC,   MVT::v32i16,  { 3, 7, 5, 5 } },
3171     { ISD::SETCC,   MVT::v64i8,   { 3, 7, 5, 5 } },
3172 
3173     { ISD::SELECT,  MVT::v8i64,   { 1, 1, 1, 1 } },
3174     { ISD::SELECT,  MVT::v4i64,   { 1, 1, 1, 1 } },
3175     { ISD::SELECT,  MVT::v2i64,   { 1, 1, 1, 1 } },
3176     { ISD::SELECT,  MVT::v16i32,  { 1, 1, 1, 1 } },
3177     { ISD::SELECT,  MVT::v8i32,   { 1, 1, 1, 1 } },
3178     { ISD::SELECT,  MVT::v4i32,   { 1, 1, 1, 1 } },
3179     { ISD::SELECT,  MVT::v8f64,   { 1, 1, 1, 1 } },
3180     { ISD::SELECT,  MVT::v4f64,   { 1, 1, 1, 1 } },
3181     { ISD::SELECT,  MVT::v2f64,   { 1, 1, 1, 1 } },
3182     { ISD::SELECT,  MVT::f64,     { 1, 1, 1, 1 } },
3183     { ISD::SELECT,  MVT::v16f32,  { 1, 1, 1, 1 } },
3184     { ISD::SELECT,  MVT::v8f32 ,  { 1, 1, 1, 1 } },
3185     { ISD::SELECT,  MVT::v4f32,   { 1, 1, 1, 1 } },
3186     { ISD::SELECT,  MVT::f32  ,   { 1, 1, 1, 1 } },
3187 
3188     { ISD::SELECT,  MVT::v32i16,  { 2, 2, 4, 4 } },
3189     { ISD::SELECT,  MVT::v16i16,  { 1, 1, 1, 1 } },
3190     { ISD::SELECT,  MVT::v8i16,   { 1, 1, 1, 1 } },
3191     { ISD::SELECT,  MVT::v64i8,   { 2, 2, 4, 4 } },
3192     { ISD::SELECT,  MVT::v32i8,   { 1, 1, 1, 1 } },
3193     { ISD::SELECT,  MVT::v16i8,   { 1, 1, 1, 1 } },
3194   };
3195 
3196   static const CostKindTblEntry AVX2CostTbl[] = {
3197     { ISD::SETCC,   MVT::v4f64,   { 1, 4, 1, 2 } },
3198     { ISD::SETCC,   MVT::v2f64,   { 1, 4, 1, 1 } },
3199     { ISD::SETCC,   MVT::f64,     { 1, 4, 1, 1 } },
3200     { ISD::SETCC,   MVT::v8f32,   { 1, 4, 1, 2 } },
3201     { ISD::SETCC,   MVT::v4f32,   { 1, 4, 1, 1 } },
3202     { ISD::SETCC,   MVT::f32,     { 1, 4, 1, 1 } },
3203 
3204     { ISD::SETCC,   MVT::v4i64,   { 1, 1, 1, 2 } },
3205     { ISD::SETCC,   MVT::v8i32,   { 1, 1, 1, 2 } },
3206     { ISD::SETCC,   MVT::v16i16,  { 1, 1, 1, 2 } },
3207     { ISD::SETCC,   MVT::v32i8,   { 1, 1, 1, 2 } },
3208 
3209     { ISD::SELECT,  MVT::v4f64,   { 2, 2, 1, 2 } }, // vblendvpd
3210     { ISD::SELECT,  MVT::v8f32,   { 2, 2, 1, 2 } }, // vblendvps
3211     { ISD::SELECT,  MVT::v4i64,   { 2, 2, 1, 2 } }, // pblendvb
3212     { ISD::SELECT,  MVT::v8i32,   { 2, 2, 1, 2 } }, // pblendvb
3213     { ISD::SELECT,  MVT::v16i16,  { 2, 2, 1, 2 } }, // pblendvb
3214     { ISD::SELECT,  MVT::v32i8,   { 2, 2, 1, 2 } }, // pblendvb
3215   };
3216 
3217   static const CostKindTblEntry XOPCostTbl[] = {
3218     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3219     { ISD::SETCC,   MVT::v2i64,   { 1, 1, 1, 1 } },
3220   };
3221 
3222   static const CostKindTblEntry AVX1CostTbl[] = {
3223     { ISD::SETCC,   MVT::v4f64,   { 2, 3, 1, 2 } },
3224     { ISD::SETCC,   MVT::v2f64,   { 1, 3, 1, 1 } },
3225     { ISD::SETCC,   MVT::f64,     { 1, 3, 1, 1 } },
3226     { ISD::SETCC,   MVT::v8f32,   { 2, 3, 1, 2 } },
3227     { ISD::SETCC,   MVT::v4f32,   { 1, 3, 1, 1 } },
3228     { ISD::SETCC,   MVT::f32,     { 1, 3, 1, 1 } },
3229 
3230     // AVX1 does not support 8-wide integer compare.
3231     { ISD::SETCC,   MVT::v4i64,   { 4, 2, 5, 6 } },
3232     { ISD::SETCC,   MVT::v8i32,   { 4, 2, 5, 6 } },
3233     { ISD::SETCC,   MVT::v16i16,  { 4, 2, 5, 6 } },
3234     { ISD::SETCC,   MVT::v32i8,   { 4, 2, 5, 6 } },
3235 
3236     { ISD::SELECT,  MVT::v4f64,   { 3, 3, 1, 2 } }, // vblendvpd
3237     { ISD::SELECT,  MVT::v8f32,   { 3, 3, 1, 2 } }, // vblendvps
3238     { ISD::SELECT,  MVT::v4i64,   { 3, 3, 1, 2 } }, // vblendvpd
3239     { ISD::SELECT,  MVT::v8i32,   { 3, 3, 1, 2 } }, // vblendvps
3240     { ISD::SELECT,  MVT::v16i16,  { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3241     { ISD::SELECT,  MVT::v32i8,   { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3242   };
3243 
3244   static const CostKindTblEntry SSE42CostTbl[] = {
3245     { ISD::SETCC,   MVT::v2i64,   { 1, 2, 1, 2 } },
3246   };
3247 
3248   static const CostKindTblEntry SSE41CostTbl[] = {
3249     { ISD::SETCC,   MVT::v2f64,   { 1, 5, 1, 1 } },
3250     { ISD::SETCC,   MVT::v4f32,   { 1, 5, 1, 1 } },
3251 
3252     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 1, 2 } }, // blendvpd
3253     { ISD::SELECT,  MVT::f64,     { 2, 2, 1, 2 } }, // blendvpd
3254     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 1, 2 } }, // blendvps
3255     { ISD::SELECT,  MVT::f32  ,   { 2, 2, 1, 2 } }, // blendvps
3256     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 1, 2 } }, // pblendvb
3257     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 1, 2 } }, // pblendvb
3258     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 1, 2 } }, // pblendvb
3259     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 1, 2 } }, // pblendvb
3260   };
3261 
3262   static const CostKindTblEntry SSE2CostTbl[] = {
3263     { ISD::SETCC,   MVT::v2f64,   { 2, 5, 1, 1 } },
3264     { ISD::SETCC,   MVT::f64,     { 1, 5, 1, 1 } },
3265 
3266     { ISD::SETCC,   MVT::v2i64,   { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3267     { ISD::SETCC,   MVT::v4i32,   { 1, 1, 1, 1 } },
3268     { ISD::SETCC,   MVT::v8i16,   { 1, 1, 1, 1 } },
3269     { ISD::SETCC,   MVT::v16i8,   { 1, 1, 1, 1 } },
3270 
3271     { ISD::SELECT,  MVT::v2f64,   { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3272     { ISD::SELECT,  MVT::f64,     { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3273     { ISD::SELECT,  MVT::v2i64,   { 2, 2, 3, 3 } }, // pand + pandn + por
3274     { ISD::SELECT,  MVT::v4i32,   { 2, 2, 3, 3 } }, // pand + pandn + por
3275     { ISD::SELECT,  MVT::v8i16,   { 2, 2, 3, 3 } }, // pand + pandn + por
3276     { ISD::SELECT,  MVT::v16i8,   { 2, 2, 3, 3 } }, // pand + pandn + por
3277   };
3278 
3279   static const CostKindTblEntry SSE1CostTbl[] = {
3280     { ISD::SETCC,   MVT::v4f32,   { 2, 5, 1, 1 } },
3281     { ISD::SETCC,   MVT::f32,     { 1, 5, 1, 1 } },
3282 
3283     { ISD::SELECT,  MVT::v4f32,   { 2, 2, 3, 3 } }, // andps + andnps + orps
3284     { ISD::SELECT,  MVT::f32,     { 2, 2, 3, 3 } }, // andps + andnps + orps
3285   };
3286 
3287   if (ST->useSLMArithCosts())
3288     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3289       if (auto KindCost = Entry->Cost[CostKind])
3290         return LT.first * (ExtraCost + *KindCost);
3291 
3292   if (ST->hasBWI())
3293     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3294       if (auto KindCost = Entry->Cost[CostKind])
3295         return LT.first * (ExtraCost + *KindCost);
3296 
3297   if (ST->hasAVX512())
3298     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3299       if (auto KindCost = Entry->Cost[CostKind])
3300         return LT.first * (ExtraCost + *KindCost);
3301 
3302   if (ST->hasAVX2())
3303     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3304       if (auto KindCost = Entry->Cost[CostKind])
3305         return LT.first * (ExtraCost + *KindCost);
3306 
3307   if (ST->hasXOP())
3308     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3309       if (auto KindCost = Entry->Cost[CostKind])
3310         return LT.first * (ExtraCost + *KindCost);
3311 
3312   if (ST->hasAVX())
3313     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3314       if (auto KindCost = Entry->Cost[CostKind])
3315         return LT.first * (ExtraCost + *KindCost);
3316 
3317   if (ST->hasSSE42())
3318     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3319       if (auto KindCost = Entry->Cost[CostKind])
3320         return LT.first * (ExtraCost + *KindCost);
3321 
3322   if (ST->hasSSE41())
3323     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3324       if (auto KindCost = Entry->Cost[CostKind])
3325         return LT.first * (ExtraCost + *KindCost);
3326 
3327   if (ST->hasSSE2())
3328     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3329       if (auto KindCost = Entry->Cost[CostKind])
3330         return LT.first * (ExtraCost + *KindCost);
3331 
3332   if (ST->hasSSE1())
3333     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3334       if (auto KindCost = Entry->Cost[CostKind])
3335         return LT.first * (ExtraCost + *KindCost);
3336 
3337   // Assume a 3cy latency for fp select ops.
3338   if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3339     if (ValTy->getScalarType()->isFloatingPointTy())
3340       return 3;
3341 
3342   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3343 }
3344 
3345 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3346 
3347 InstructionCost
3348 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3349                                   TTI::TargetCostKind CostKind) {
3350   // Costs should match the codegen from:
3351   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3352   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3353   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3354   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3355   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3356 
3357   // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3358   //       specialized in these tables yet.
3359   static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3360     { ISD::FSHL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3361     { ISD::FSHL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3362     { ISD::FSHL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3363     { ISD::FSHL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3364     { ISD::FSHL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3365     { ISD::FSHL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3366     { ISD::FSHL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3367     { ISD::FSHL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3368     { ISD::FSHL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3369     { ISD::ROTL,       MVT::v32i16,  {  1,  1,  1,  1 } },
3370     { ISD::ROTL,       MVT::v16i16,  {  1,  1,  1,  1 } },
3371     { ISD::ROTL,       MVT::v8i16,   {  1,  1,  1,  1 } },
3372     { ISD::ROTR,       MVT::v32i16,  {  1,  1,  1,  1 } },
3373     { ISD::ROTR,       MVT::v16i16,  {  1,  1,  1,  1 } },
3374     { ISD::ROTR,       MVT::v8i16,   {  1,  1,  1,  1 } },
3375   };
3376   static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3377     { ISD::CTPOP,      MVT::v32i16,  {  1,  1,  1,  1 } },
3378     { ISD::CTPOP,      MVT::v64i8,   {  1,  1,  1,  1 } },
3379     { ISD::CTPOP,      MVT::v16i16,  {  1,  1,  1,  1 } },
3380     { ISD::CTPOP,      MVT::v32i8,   {  1,  1,  1,  1 } },
3381     { ISD::CTPOP,      MVT::v8i16,   {  1,  1,  1,  1 } },
3382     { ISD::CTPOP,      MVT::v16i8,   {  1,  1,  1,  1 } },
3383   };
3384   static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3385     { ISD::CTPOP,      MVT::v8i64,   {  1,  1,  1,  1 } },
3386     { ISD::CTPOP,      MVT::v16i32,  {  1,  1,  1,  1 } },
3387     { ISD::CTPOP,      MVT::v4i64,   {  1,  1,  1,  1 } },
3388     { ISD::CTPOP,      MVT::v8i32,   {  1,  1,  1,  1 } },
3389     { ISD::CTPOP,      MVT::v2i64,   {  1,  1,  1,  1 } },
3390     { ISD::CTPOP,      MVT::v4i32,   {  1,  1,  1,  1 } },
3391   };
3392   static const CostKindTblEntry AVX512CDCostTbl[] = {
3393     { ISD::CTLZ,       MVT::v8i64,   {  1,  5,  1,  1 } },
3394     { ISD::CTLZ,       MVT::v16i32,  {  1,  5,  1,  1 } },
3395     { ISD::CTLZ,       MVT::v32i16,  { 18, 27, 23, 27 } },
3396     { ISD::CTLZ,       MVT::v64i8,   {  3, 16,  9, 11 } },
3397     { ISD::CTLZ,       MVT::v4i64,   {  1,  5,  1,  1 } },
3398     { ISD::CTLZ,       MVT::v8i32,   {  1,  5,  1,  1 } },
3399     { ISD::CTLZ,       MVT::v16i16,  {  8, 19, 11, 13 } },
3400     { ISD::CTLZ,       MVT::v32i8,   {  2, 11,  9, 10 } },
3401     { ISD::CTLZ,       MVT::v2i64,   {  1,  5,  1,  1 } },
3402     { ISD::CTLZ,       MVT::v4i32,   {  1,  5,  1,  1 } },
3403     { ISD::CTLZ,       MVT::v8i16,   {  3, 15,  4,  6 } },
3404     { ISD::CTLZ,       MVT::v16i8,   {  2, 10,  9, 10 } },
3405 
3406     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3407     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3408     { ISD::CTTZ,       MVT::v4i64,   {  1,  8,  6,  6 } },
3409     { ISD::CTTZ,       MVT::v8i32,   {  1,  8,  6,  6 } },
3410     { ISD::CTTZ,       MVT::v2i64,   {  1,  8,  6,  6 } },
3411     { ISD::CTTZ,       MVT::v4i32,   {  1,  8,  6,  6 } },
3412   };
3413   static const CostKindTblEntry AVX512BWCostTbl[] = {
3414     { ISD::ABS,        MVT::v32i16,  {  1,  1,  1,  1 } },
3415     { ISD::ABS,        MVT::v64i8,   {  1,  1,  1,  1 } },
3416     { ISD::BITREVERSE, MVT::v2i64,   {  3, 10, 10, 11 } },
3417     { ISD::BITREVERSE, MVT::v4i64,   {  3, 11, 10, 11 } },
3418     { ISD::BITREVERSE, MVT::v8i64,   {  3, 12, 10, 14 } },
3419     { ISD::BITREVERSE, MVT::v4i32,   {  3, 10, 10, 11 } },
3420     { ISD::BITREVERSE, MVT::v8i32,   {  3, 11, 10, 11 } },
3421     { ISD::BITREVERSE, MVT::v16i32,  {  3, 12, 10, 14 } },
3422     { ISD::BITREVERSE, MVT::v8i16,   {  3, 10, 10, 11 } },
3423     { ISD::BITREVERSE, MVT::v16i16,  {  3, 11, 10, 11 } },
3424     { ISD::BITREVERSE, MVT::v32i16,  {  3, 12, 10, 14 } },
3425     { ISD::BITREVERSE, MVT::v16i8,   {  2,  5,  9,  9 } },
3426     { ISD::BITREVERSE, MVT::v32i8,   {  2,  5,  9,  9 } },
3427     { ISD::BITREVERSE, MVT::v64i8,   {  2,  5,  9, 12 } },
3428     { ISD::BSWAP,      MVT::v2i64,   {  1,  1,  1,  2 } },
3429     { ISD::BSWAP,      MVT::v4i64,   {  1,  1,  1,  2 } },
3430     { ISD::BSWAP,      MVT::v8i64,   {  1,  1,  1,  2 } },
3431     { ISD::BSWAP,      MVT::v4i32,   {  1,  1,  1,  2 } },
3432     { ISD::BSWAP,      MVT::v8i32,   {  1,  1,  1,  2 } },
3433     { ISD::BSWAP,      MVT::v16i32,  {  1,  1,  1,  2 } },
3434     { ISD::BSWAP,      MVT::v8i16,   {  1,  1,  1,  2 } },
3435     { ISD::BSWAP,      MVT::v16i16,  {  1,  1,  1,  2 } },
3436     { ISD::BSWAP,      MVT::v32i16,  {  1,  1,  1,  2 } },
3437     { ISD::CTLZ,       MVT::v8i64,   {  8, 22, 23, 23 } },
3438     { ISD::CTLZ,       MVT::v16i32,  {  8, 23, 25, 25 } },
3439     { ISD::CTLZ,       MVT::v32i16,  {  4, 15, 15, 16 } },
3440     { ISD::CTLZ,       MVT::v64i8,   {  3, 12, 10,  9 } },
3441     { ISD::CTPOP,      MVT::v2i64,   {  3,  7, 10, 10 } },
3442     { ISD::CTPOP,      MVT::v4i64,   {  3,  7, 10, 10 } },
3443     { ISD::CTPOP,      MVT::v8i64,   {  3,  8, 10, 12 } },
3444     { ISD::CTPOP,      MVT::v4i32,   {  7, 11, 14, 14 } },
3445     { ISD::CTPOP,      MVT::v8i32,   {  7, 11, 14, 14 } },
3446     { ISD::CTPOP,      MVT::v16i32,  {  7, 12, 14, 16 } },
3447     { ISD::CTPOP,      MVT::v8i16,   {  2,  7, 11, 11 } },
3448     { ISD::CTPOP,      MVT::v16i16,  {  2,  7, 11, 11 } },
3449     { ISD::CTPOP,      MVT::v32i16,  {  3,  7, 11, 13 } },
3450     { ISD::CTPOP,      MVT::v16i8,   {  2,  4,  8,  8 } },
3451     { ISD::CTPOP,      MVT::v32i8,   {  2,  4,  8,  8 } },
3452     { ISD::CTPOP,      MVT::v64i8,   {  2,  5,  8, 10 } },
3453     { ISD::CTTZ,       MVT::v8i16,   {  3,  9, 14, 14 } },
3454     { ISD::CTTZ,       MVT::v16i16,  {  3,  9, 14, 14 } },
3455     { ISD::CTTZ,       MVT::v32i16,  {  3, 10, 14, 16 } },
3456     { ISD::CTTZ,       MVT::v16i8,   {  2,  6, 11, 11 } },
3457     { ISD::CTTZ,       MVT::v32i8,   {  2,  6, 11, 11 } },
3458     { ISD::CTTZ,       MVT::v64i8,   {  3,  7, 11, 13 } },
3459     { ISD::ROTL,       MVT::v32i16,  {  2,  8,  6,  8 } },
3460     { ISD::ROTL,       MVT::v16i16,  {  2,  8,  6,  7 } },
3461     { ISD::ROTL,       MVT::v8i16,   {  2,  7,  6,  7 } },
3462     { ISD::ROTL,       MVT::v64i8,   {  5,  6, 11, 12 } },
3463     { ISD::ROTL,       MVT::v32i8,   {  5, 15,  7, 10 } },
3464     { ISD::ROTL,       MVT::v16i8,   {  5, 15,  7, 10 } },
3465     { ISD::ROTR,       MVT::v32i16,  {  2,  8,  6,  8 } },
3466     { ISD::ROTR,       MVT::v16i16,  {  2,  8,  6,  7 } },
3467     { ISD::ROTR,       MVT::v8i16,   {  2,  7,  6,  7 } },
3468     { ISD::ROTR,       MVT::v64i8,   {  5,  6, 12, 14 } },
3469     { ISD::ROTR,       MVT::v32i8,   {  5, 14,  6,  9 } },
3470     { ISD::ROTR,       MVT::v16i8,   {  5, 14,  6,  9 } },
3471     { ISD::SADDSAT,    MVT::v32i16,  {  1 } },
3472     { ISD::SADDSAT,    MVT::v64i8,   {  1 } },
3473     { ISD::SMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3474     { ISD::SMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3475     { ISD::SMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3476     { ISD::SMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3477     { ISD::SSUBSAT,    MVT::v32i16,  {  1 } },
3478     { ISD::SSUBSAT,    MVT::v64i8,   {  1 } },
3479     { ISD::UADDSAT,    MVT::v32i16,  {  1 } },
3480     { ISD::UADDSAT,    MVT::v64i8,   {  1 } },
3481     { ISD::UMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
3482     { ISD::UMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
3483     { ISD::UMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
3484     { ISD::UMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
3485     { ISD::USUBSAT,    MVT::v32i16,  {  1 } },
3486     { ISD::USUBSAT,    MVT::v64i8,   {  1 } },
3487   };
3488   static const CostKindTblEntry AVX512CostTbl[] = {
3489     { ISD::ABS,        MVT::v8i64,   {  1,  1,  1,  1 } },
3490     { ISD::ABS,        MVT::v4i64,   {  1,  1,  1,  1 } },
3491     { ISD::ABS,        MVT::v2i64,   {  1,  1,  1,  1 } },
3492     { ISD::ABS,        MVT::v16i32,  {  1,  1,  1,  1 } },
3493     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  1 } },
3494     { ISD::ABS,        MVT::v32i16,  {  2,  7,  4,  4 } },
3495     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  1 } },
3496     { ISD::ABS,        MVT::v64i8,   {  2,  7,  4,  4 } },
3497     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  1 } },
3498     { ISD::BITREVERSE, MVT::v8i64,   {  9, 13, 20, 20 } },
3499     { ISD::BITREVERSE, MVT::v16i32,  {  9, 13, 20, 20 } },
3500     { ISD::BITREVERSE, MVT::v32i16,  {  9, 13, 20, 20 } },
3501     { ISD::BITREVERSE, MVT::v64i8,   {  6, 11, 17, 17 } },
3502     { ISD::BSWAP,      MVT::v8i64,   {  4,  7,  5,  5 } },
3503     { ISD::BSWAP,      MVT::v16i32,  {  4,  7,  5,  5 } },
3504     { ISD::BSWAP,      MVT::v32i16,  {  4,  7,  5,  5 } },
3505     { ISD::CTLZ,       MVT::v8i64,   { 10, 28, 32, 32 } },
3506     { ISD::CTLZ,       MVT::v16i32,  { 12, 30, 38, 38 } },
3507     { ISD::CTLZ,       MVT::v32i16,  {  8, 15, 29, 29 } },
3508     { ISD::CTLZ,       MVT::v64i8,   {  6, 11, 19, 19 } },
3509     { ISD::CTPOP,      MVT::v8i64,   { 16, 16, 19, 19 } },
3510     { ISD::CTPOP,      MVT::v16i32,  { 24, 19, 27, 27 } },
3511     { ISD::CTPOP,      MVT::v32i16,  { 18, 15, 22, 22 } },
3512     { ISD::CTPOP,      MVT::v64i8,   { 12, 11, 16, 16 } },
3513     { ISD::CTTZ,       MVT::v8i64,   {  2,  8,  6,  7 } },
3514     { ISD::CTTZ,       MVT::v16i32,  {  2,  8,  6,  7 } },
3515     { ISD::CTTZ,       MVT::v32i16,  {  7, 17, 27, 27 } },
3516     { ISD::CTTZ,       MVT::v64i8,   {  6, 13, 21, 21 } },
3517     { ISD::ROTL,       MVT::v8i64,   {  1,  1,  1,  1 } },
3518     { ISD::ROTL,       MVT::v4i64,   {  1,  1,  1,  1 } },
3519     { ISD::ROTL,       MVT::v2i64,   {  1,  1,  1,  1 } },
3520     { ISD::ROTL,       MVT::v16i32,  {  1,  1,  1,  1 } },
3521     { ISD::ROTL,       MVT::v8i32,   {  1,  1,  1,  1 } },
3522     { ISD::ROTL,       MVT::v4i32,   {  1,  1,  1,  1 } },
3523     { ISD::ROTR,       MVT::v8i64,   {  1,  1,  1,  1 } },
3524     { ISD::ROTR,       MVT::v4i64,   {  1,  1,  1,  1 } },
3525     { ISD::ROTR,       MVT::v2i64,   {  1,  1,  1,  1 } },
3526     { ISD::ROTR,       MVT::v16i32,  {  1,  1,  1,  1 } },
3527     { ISD::ROTR,       MVT::v8i32,   {  1,  1,  1,  1 } },
3528     { ISD::ROTR,       MVT::v4i32,   {  1,  1,  1,  1 } },
3529     { ISD::SMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3530     { ISD::SMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3531     { ISD::SMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3532     { ISD::SMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3533     { ISD::SMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3534     { ISD::SMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3535     { ISD::SMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3536     { ISD::SMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3537     { ISD::SMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3538     { ISD::SMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3539     { ISD::SMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3540     { ISD::SMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3541     { ISD::UMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
3542     { ISD::UMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
3543     { ISD::UMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
3544     { ISD::UMAX,       MVT::v64i8,   {  3,  7,  5,  5 } },
3545     { ISD::UMAX,       MVT::v4i64,   {  1,  3,  1,  1 } },
3546     { ISD::UMAX,       MVT::v2i64,   {  1,  3,  1,  1 } },
3547     { ISD::UMIN,       MVT::v8i64,   {  1,  3,  1,  1 } },
3548     { ISD::UMIN,       MVT::v16i32,  {  1,  1,  1,  1 } },
3549     { ISD::UMIN,       MVT::v32i16,  {  3,  7,  5,  5 } },
3550     { ISD::UMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
3551     { ISD::UMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
3552     { ISD::UMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
3553     { ISD::USUBSAT,    MVT::v16i32,  {  2 } }, // pmaxud + psubd
3554     { ISD::USUBSAT,    MVT::v2i64,   {  2 } }, // pmaxuq + psubq
3555     { ISD::USUBSAT,    MVT::v4i64,   {  2 } }, // pmaxuq + psubq
3556     { ISD::USUBSAT,    MVT::v8i64,   {  2 } }, // pmaxuq + psubq
3557     { ISD::UADDSAT,    MVT::v16i32,  {  3 } }, // not + pminud + paddd
3558     { ISD::UADDSAT,    MVT::v2i64,   {  3 } }, // not + pminuq + paddq
3559     { ISD::UADDSAT,    MVT::v4i64,   {  3 } }, // not + pminuq + paddq
3560     { ISD::UADDSAT,    MVT::v8i64,   {  3 } }, // not + pminuq + paddq
3561     { ISD::SADDSAT,    MVT::v32i16,  {  2 } },
3562     { ISD::SADDSAT,    MVT::v64i8,   {  2 } },
3563     { ISD::SSUBSAT,    MVT::v32i16,  {  2 } },
3564     { ISD::SSUBSAT,    MVT::v64i8,   {  2 } },
3565     { ISD::UADDSAT,    MVT::v32i16,  {  2 } },
3566     { ISD::UADDSAT,    MVT::v64i8,   {  2 } },
3567     { ISD::USUBSAT,    MVT::v32i16,  {  2 } },
3568     { ISD::USUBSAT,    MVT::v64i8,   {  2 } },
3569     { ISD::FMAXNUM,    MVT::f32,     {  2,  2,  3,  3 } },
3570     { ISD::FMAXNUM,    MVT::v4f32,   {  1,  1,  3,  3 } },
3571     { ISD::FMAXNUM,    MVT::v8f32,   {  2,  2,  3,  3 } },
3572     { ISD::FMAXNUM,    MVT::v16f32,  {  4,  4,  3,  3 } },
3573     { ISD::FMAXNUM,    MVT::f64,     {  2,  2,  3,  3 } },
3574     { ISD::FMAXNUM,    MVT::v2f64,   {  1,  1,  3,  3 } },
3575     { ISD::FMAXNUM,    MVT::v4f64,   {  2,  2,  3,  3 } },
3576     { ISD::FMAXNUM,    MVT::v8f64,   {  3,  3,  3,  3 } },
3577     { ISD::FSQRT,      MVT::f32,     {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3578     { ISD::FSQRT,      MVT::v4f32,   {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3579     { ISD::FSQRT,      MVT::v8f32,   {  6, 12,  1,  1 } }, // Skylake from http://www.agner.org/
3580     { ISD::FSQRT,      MVT::v16f32,  { 12, 20,  1,  3 } }, // Skylake from http://www.agner.org/
3581     { ISD::FSQRT,      MVT::f64,     {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3582     { ISD::FSQRT,      MVT::v2f64,   {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3583     { ISD::FSQRT,      MVT::v4f64,   { 12, 18,  1,  1 } }, // Skylake from http://www.agner.org/
3584     { ISD::FSQRT,      MVT::v8f64,   { 24, 32,  1,  3 } }, // Skylake from http://www.agner.org/
3585   };
3586   static const CostKindTblEntry XOPCostTbl[] = {
         // Intrinsic costs, one row per (ISD opcode, MVT), for subtargets with XOP.
         // NOTE(review): the four values per row appear to be one cost per TTI
         // cost kind - confirm the field order against CostKindTblEntry.
3587     { ISD::BITREVERSE, MVT::v4i64,   {  3,  6,  5,  6 } },
3588     { ISD::BITREVERSE, MVT::v8i32,   {  3,  6,  5,  6 } },
3589     { ISD::BITREVERSE, MVT::v16i16,  {  3,  6,  5,  6 } },
3590     { ISD::BITREVERSE, MVT::v32i8,   {  3,  6,  5,  6 } },
3591     { ISD::BITREVERSE, MVT::v2i64,   {  2,  7,  1,  1 } },
3592     { ISD::BITREVERSE, MVT::v4i32,   {  2,  7,  1,  1 } },
3593     { ISD::BITREVERSE, MVT::v8i16,   {  2,  7,  1,  1 } },
3594     { ISD::BITREVERSE, MVT::v16i8,   {  2,  7,  1,  1 } },
3595     { ISD::BITREVERSE, MVT::i64,     {  2,  2,  3,  4 } },
3596     { ISD::BITREVERSE, MVT::i32,     {  2,  2,  3,  4 } },
3597     { ISD::BITREVERSE, MVT::i16,     {  2,  2,  3,  4 } },
3598     { ISD::BITREVERSE, MVT::i8,      {  2,  2,  3,  4 } },
3599     // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
         // Presumably the extra SUB is why the ROTR rows are larger than the
         // matching ROTL rows in their last two fields - TODO confirm.
3600     { ISD::ROTL,       MVT::v4i64,   {  4,  7,  5,  6 } },
3601     { ISD::ROTL,       MVT::v8i32,   {  4,  7,  5,  6 } },
3602     { ISD::ROTL,       MVT::v16i16,  {  4,  7,  5,  6 } },
3603     { ISD::ROTL,       MVT::v32i8,   {  4,  7,  5,  6 } },
3604     { ISD::ROTL,       MVT::v2i64,   {  1,  3,  1,  1 } },
3605     { ISD::ROTL,       MVT::v4i32,   {  1,  3,  1,  1 } },
3606     { ISD::ROTL,       MVT::v8i16,   {  1,  3,  1,  1 } },
3607     { ISD::ROTL,       MVT::v16i8,   {  1,  3,  1,  1 } },
3608     { ISD::ROTR,       MVT::v4i64,   {  4,  7,  8,  9 } },
3609     { ISD::ROTR,       MVT::v8i32,   {  4,  7,  8,  9 } },
3610     { ISD::ROTR,       MVT::v16i16,  {  4,  7,  8,  9 } },
3611     { ISD::ROTR,       MVT::v32i8,   {  4,  7,  8,  9 } },
3612     { ISD::ROTR,       MVT::v2i64,   {  1,  3,  3,  3 } },
3613     { ISD::ROTR,       MVT::v4i32,   {  1,  3,  3,  3 } },
3614     { ISD::ROTR,       MVT::v8i16,   {  1,  3,  3,  3 } },
3615     { ISD::ROTR,       MVT::v16i8,   {  1,  3,  3,  3 } }
3616   };
3617   static const CostKindTblEntry AVX2CostTbl[] = {
         // Intrinsic costs, one row per (ISD opcode, MVT), for AVX2 subtargets.
         // Per the file header, AVX2 numbers are modelled on Haswell / Ryzen.
         // Rows with a single value (the *SAT ops) spell out only the first cost
         // field. NOTE(review): confirm how CostKindTblEntry treats the omitted
         // fields.
3618     { ISD::ABS,        MVT::v2i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3619     { ISD::ABS,        MVT::v4i64,   {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3620     { ISD::ABS,        MVT::v4i32,   {  1,  1,  1,  1 } },
3621     { ISD::ABS,        MVT::v8i32,   {  1,  1,  1,  2 } },
3622     { ISD::ABS,        MVT::v8i16,   {  1,  1,  1,  1 } },
3623     { ISD::ABS,        MVT::v16i16,  {  1,  1,  1,  2 } },
3624     { ISD::ABS,        MVT::v16i8,   {  1,  1,  1,  1 } },
3625     { ISD::ABS,        MVT::v32i8,   {  1,  1,  1,  2 } },
3626     { ISD::BITREVERSE, MVT::v2i64,   {  3, 11, 10, 11 } },
3627     { ISD::BITREVERSE, MVT::v4i64,   {  5, 11, 10, 17 } },
3628     { ISD::BITREVERSE, MVT::v4i32,   {  3, 11, 10, 11 } },
3629     { ISD::BITREVERSE, MVT::v8i32,   {  5, 11, 10, 17 } },
3630     { ISD::BITREVERSE, MVT::v8i16,   {  3, 11, 10, 11 } },
3631     { ISD::BITREVERSE, MVT::v16i16,  {  5, 11, 10, 17 } },
3632     { ISD::BITREVERSE, MVT::v16i8,   {  3,  6,  9,  9 } },
3633     { ISD::BITREVERSE, MVT::v32i8,   {  4,  5,  9, 15 } },
3634     { ISD::BSWAP,      MVT::v2i64,   {  1,  2,  1,  2 } },
3635     { ISD::BSWAP,      MVT::v4i64,   {  1,  3,  1,  2 } },
3636     { ISD::BSWAP,      MVT::v4i32,   {  1,  2,  1,  2 } },
3637     { ISD::BSWAP,      MVT::v8i32,   {  1,  3,  1,  2 } },
3638     { ISD::BSWAP,      MVT::v8i16,   {  1,  2,  1,  2 } },
3639     { ISD::BSWAP,      MVT::v16i16,  {  1,  3,  1,  2 } },
3640     { ISD::CTLZ,       MVT::v2i64,   {  7, 18, 24, 25 } },
3641     { ISD::CTLZ,       MVT::v4i64,   { 14, 18, 24, 44 } },
3642     { ISD::CTLZ,       MVT::v4i32,   {  5, 16, 19, 20 } },
3643     { ISD::CTLZ,       MVT::v8i32,   { 10, 16, 19, 34 } },
3644     { ISD::CTLZ,       MVT::v8i16,   {  4, 13, 14, 15 } },
3645     { ISD::CTLZ,       MVT::v16i16,  {  6, 14, 14, 24 } },
3646     { ISD::CTLZ,       MVT::v16i8,   {  3, 12,  9, 10 } },
3647     { ISD::CTLZ,       MVT::v32i8,   {  4, 12,  9, 14 } },
3648     { ISD::CTPOP,      MVT::v2i64,   {  3,  9, 10, 10 } },
3649     { ISD::CTPOP,      MVT::v4i64,   {  4,  9, 10, 14 } },
3650     { ISD::CTPOP,      MVT::v4i32,   {  7, 12, 14, 14 } },
3651     { ISD::CTPOP,      MVT::v8i32,   {  7, 12, 14, 18 } },
3652     { ISD::CTPOP,      MVT::v8i16,   {  3,  7, 11, 11 } },
3653     { ISD::CTPOP,      MVT::v16i16,  {  6,  8, 11, 18 } },
3654     { ISD::CTPOP,      MVT::v16i8,   {  2,  5,  8,  8 } },
3655     { ISD::CTPOP,      MVT::v32i8,   {  3,  5,  8, 12 } },
3656     { ISD::CTTZ,       MVT::v2i64,   {  4, 11, 13, 13 } },
3657     { ISD::CTTZ,       MVT::v4i64,   {  5, 11, 13, 20 } },
3658     { ISD::CTTZ,       MVT::v4i32,   {  7, 14, 17, 17 } },
3659     { ISD::CTTZ,       MVT::v8i32,   {  7, 15, 17, 24 } },
3660     { ISD::CTTZ,       MVT::v8i16,   {  4,  9, 14, 14 } },
3661     { ISD::CTTZ,       MVT::v16i16,  {  6,  9, 14, 24 } },
3662     { ISD::CTTZ,       MVT::v16i8,   {  3,  7, 11, 11 } },
3663     { ISD::CTTZ,       MVT::v32i8,   {  5,  7, 11, 18 } },
3664     { ISD::SADDSAT,    MVT::v16i16,  {  1 } },
3665     { ISD::SADDSAT,    MVT::v32i8,   {  1 } },
3666     { ISD::SMAX,       MVT::v2i64,   {  2,  7,  2,  3 } },
3667     { ISD::SMAX,       MVT::v4i64,   {  2,  7,  2,  3 } },
3668     { ISD::SMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
3669     { ISD::SMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
3670     { ISD::SMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
3671     { ISD::SMIN,       MVT::v2i64,   {  2,  7,  2,  3 } },
3672     { ISD::SMIN,       MVT::v4i64,   {  2,  7,  2,  3 } },
3673     { ISD::SMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
3674     { ISD::SMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
3675     { ISD::SMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
3676     { ISD::SSUBSAT,    MVT::v16i16,  {  1 } },
3677     { ISD::SSUBSAT,    MVT::v32i8,   {  1 } },
3678     { ISD::UADDSAT,    MVT::v16i16,  {  1 } },
3679     { ISD::UADDSAT,    MVT::v32i8,   {  1 } },
3680     { ISD::UADDSAT,    MVT::v8i32,   {  3 } }, // not + pminud + paddd
3681     { ISD::UMAX,       MVT::v2i64,   {  2,  8,  5,  6 } },
3682     { ISD::UMAX,       MVT::v4i64,   {  2,  8,  5,  8 } },
3683     { ISD::UMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
3684     { ISD::UMAX,       MVT::v16i16,  {  1,  1,  1,  2 } },
3685     { ISD::UMAX,       MVT::v32i8,   {  1,  1,  1,  2 } },
3686     { ISD::UMIN,       MVT::v2i64,   {  2,  8,  5,  6 } },
3687     { ISD::UMIN,       MVT::v4i64,   {  2,  8,  5,  8 } },
3688     { ISD::UMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
3689     { ISD::UMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
3690     { ISD::UMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
3691     { ISD::USUBSAT,    MVT::v16i16,  {  1 } },
3692     { ISD::USUBSAT,    MVT::v32i8,   {  1 } },
3693     { ISD::USUBSAT,    MVT::v8i32,   {  2 } }, // pmaxud + psubd
3694     { ISD::FMAXNUM,    MVT::f32,     {  2,  7,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3695     { ISD::FMAXNUM,    MVT::v4f32,   {  2,  7,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3696     { ISD::FMAXNUM,    MVT::v8f32,   {  3,  7,  3,  6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3697     { ISD::FMAXNUM,    MVT::f64,     {  2,  7,  3,  5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3698     { ISD::FMAXNUM,    MVT::v2f64,   {  2,  7,  3,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3699     { ISD::FMAXNUM,    MVT::v4f64,   {  3,  7,  3,  6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3700     { ISD::FSQRT,      MVT::f32,     {  7, 15,  1,  1 } }, // vsqrtss
3701     { ISD::FSQRT,      MVT::v4f32,   {  7, 15,  1,  1 } }, // vsqrtps
3702     { ISD::FSQRT,      MVT::v8f32,   { 14, 21,  1,  3 } }, // vsqrtps
3703     { ISD::FSQRT,      MVT::f64,     { 14, 21,  1,  1 } }, // vsqrtsd
3704     { ISD::FSQRT,      MVT::v2f64,   { 14, 21,  1,  1 } }, // vsqrtpd
3705     { ISD::FSQRT,      MVT::v4f64,   { 28, 35,  1,  3 } }, // vsqrtpd
3706   };
3707   static const CostKindTblEntry AVX1CostTbl[] = {
         // Intrinsic costs, one row per (ISD opcode, MVT), for AVX (AVX1) subtargets.
         // Per the file header, AVX numbers are modelled on Sandy Bridge / Jaguar /
         // Bulldozer. Most 256-bit integer rows are annotated as two 128-bit ops
         // plus extract/insert. Rows with a single value spell out only the first
         // cost field. NOTE(review): confirm how CostKindTblEntry treats the
         // omitted fields.
3708     { ISD::ABS,        MVT::v4i64,   {  6,  8,  6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3709     { ISD::ABS,        MVT::v8i32,   {  3,  6,  4,  5 } },
3710     { ISD::ABS,        MVT::v16i16,  {  3,  6,  4,  5 } },
3711     { ISD::ABS,        MVT::v32i8,   {  3,  6,  4,  5 } },
3712     { ISD::BITREVERSE, MVT::v4i64,   { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3713     { ISD::BITREVERSE, MVT::v2i64,   {  8, 13, 10, 16 } },
3714     { ISD::BITREVERSE, MVT::v8i32,   { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3715     { ISD::BITREVERSE, MVT::v4i32,   {  8, 13, 10, 16 } },
3716     { ISD::BITREVERSE, MVT::v16i16,  { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3717     { ISD::BITREVERSE, MVT::v8i16,   {  8, 13, 10, 16 } },
3718     { ISD::BITREVERSE, MVT::v32i8,   { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3719     { ISD::BITREVERSE, MVT::v16i8,   {  7,  7,  9, 13 } },
3720     { ISD::BSWAP,      MVT::v4i64,   {  5,  7,  5, 10 } },
3721     { ISD::BSWAP,      MVT::v2i64,   {  2,  3,  1,  3 } },
3722     { ISD::BSWAP,      MVT::v8i32,   {  5,  7,  5, 10 } },
3723     { ISD::BSWAP,      MVT::v4i32,   {  2,  3,  1,  3 } },
3724     { ISD::BSWAP,      MVT::v16i16,  {  5,  6,  5, 10 } },
3725     { ISD::BSWAP,      MVT::v8i16,   {  2,  2,  1,  3 } },
3726     { ISD::CTLZ,       MVT::v4i64,   { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3727     { ISD::CTLZ,       MVT::v2i64,   { 14, 24, 24, 28 } },
3728     { ISD::CTLZ,       MVT::v8i32,   { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3729     { ISD::CTLZ,       MVT::v4i32,   { 12, 20, 19, 23 } },
3730     { ISD::CTLZ,       MVT::v16i16,  { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3731     { ISD::CTLZ,       MVT::v8i16,   {  9, 16, 14, 18 } },
3732     { ISD::CTLZ,       MVT::v32i8,   { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3733     { ISD::CTLZ,       MVT::v16i8,   {  7, 12,  9, 13 } },
3734     { ISD::CTPOP,      MVT::v4i64,   { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3735     { ISD::CTPOP,      MVT::v2i64,   {  7, 14, 10, 14 } },
3736     { ISD::CTPOP,      MVT::v8i32,   { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3737     { ISD::CTPOP,      MVT::v4i32,   {  9, 20, 14, 18 } },
3738     { ISD::CTPOP,      MVT::v16i16,  { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3739     { ISD::CTPOP,      MVT::v8i16,   {  8, 18, 11, 15 } },
3740     { ISD::CTPOP,      MVT::v32i8,   { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3741     { ISD::CTPOP,      MVT::v16i8,   {  6, 12,  8, 12 } },
3742     { ISD::CTTZ,       MVT::v4i64,   { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3743     { ISD::CTTZ,       MVT::v2i64,   {  9, 19, 13, 17 } },
3744     { ISD::CTTZ,       MVT::v8i32,   { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3745     { ISD::CTTZ,       MVT::v4i32,   { 11, 24, 17, 21 } },
3746     { ISD::CTTZ,       MVT::v16i16,  { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3747     { ISD::CTTZ,       MVT::v8i16,   {  9, 21, 14, 18 } },
3748     { ISD::CTTZ,       MVT::v32i8,   { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3749     { ISD::CTTZ,       MVT::v16i8,   {  8, 16, 11, 15 } },
3750     { ISD::SADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3751     { ISD::SADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3752     { ISD::SMAX,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
3753     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  4 } },
3754     { ISD::SMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3755     { ISD::SMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3756     { ISD::SMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3757     { ISD::SMIN,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
3758     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
3759     { ISD::SMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3760     { ISD::SMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3761     { ISD::SMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3762     { ISD::SSUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3763     { ISD::SSUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3764     { ISD::UADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3765     { ISD::UADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3766     { ISD::UADDSAT,    MVT::v8i32,   {  8 } }, // 2 x 128-bit Op + extract/insert
3767     { ISD::UMAX,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3768     { ISD::UMAX,       MVT::v2i64,   {  4,  8,  5,  7 } },
3769     { ISD::UMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3770     { ISD::UMAX,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3771     { ISD::UMAX,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3772     { ISD::UMIN,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3773     { ISD::UMIN,       MVT::v2i64,   {  4,  8,  5,  7 } },
3774     { ISD::UMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3775     { ISD::UMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3776     { ISD::UMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
3777     { ISD::USUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
3778     { ISD::USUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
3779     { ISD::USUBSAT,    MVT::v8i32,   {  6 } }, // 2 x 128-bit Op + extract/insert
3780     { ISD::FMAXNUM,    MVT::f32,     {  3,  6,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3781     { ISD::FMAXNUM,    MVT::v4f32,   {  3,  6,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3782     { ISD::FMAXNUM,    MVT::v8f32,   {  5,  7,  3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3783     { ISD::FMAXNUM,    MVT::f64,     {  3,  6,  3,  5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3784     { ISD::FMAXNUM,    MVT::v2f64,   {  3,  6,  3,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3785     { ISD::FMAXNUM,    MVT::v4f64,   {  5,  7,  3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3786     { ISD::FSQRT,      MVT::f32,     { 21, 21,  1,  1 } }, // vsqrtss
3787     { ISD::FSQRT,      MVT::v4f32,   { 21, 21,  1,  1 } }, // vsqrtps
3788     { ISD::FSQRT,      MVT::v8f32,   { 42, 42,  1,  3 } }, // vsqrtps
3789     { ISD::FSQRT,      MVT::f64,     { 27, 27,  1,  1 } }, // vsqrtsd
3790     { ISD::FSQRT,      MVT::v2f64,   { 27, 27,  1,  1 } }, // vsqrtpd
3791     { ISD::FSQRT,      MVT::v4f64,   { 54, 54,  1,  3 } }, // vsqrtpd
3792   };
3793   static const CostKindTblEntry GLMCostTbl[] = {
         // CPU-specific square-root costs; this table only overrides FSQRT.
         // NOTE(review): "GLM" presumably means Goldmont - confirm against the
         // subtarget check that selects this table.
3794     { ISD::FSQRT,      MVT::f32,     { 19, 20, 1, 1 } }, // sqrtss
3795     { ISD::FSQRT,      MVT::v4f32,   { 37, 41, 1, 5 } }, // sqrtps
3796     { ISD::FSQRT,      MVT::f64,     { 34, 35, 1, 1 } }, // sqrtsd
3797     { ISD::FSQRT,      MVT::v2f64,   { 67, 71, 1, 5 } }, // sqrtpd
3798   };
3799   static const CostKindTblEntry SLMCostTbl[] = {
         // CPU-specific square-root costs; this table only overrides FSQRT.
         // NOTE(review): "SLM" presumably means Silvermont - confirm against the
         // subtarget check that selects this table.
3800     { ISD::FSQRT,      MVT::f32,     { 20, 20, 1, 1 } }, // sqrtss
3801     { ISD::FSQRT,      MVT::v4f32,   { 40, 41, 1, 5 } }, // sqrtps
3802     { ISD::FSQRT,      MVT::f64,     { 35, 35, 1, 1 } }, // sqrtsd
3803     { ISD::FSQRT,      MVT::v2f64,   { 70, 71, 1, 5 } }, // sqrtpd
3804   };
3805   static const CostKindTblEntry SSE42CostTbl[] = {
         // Intrinsic costs for SSE4.2 subtargets (128-bit and scalar FP only).
         // Per the file header, SSE4.2 numbers are modelled on Nehalem / Silvermont.
         // The trailing comments name the instruction sequence being costed.
3806     { ISD::USUBSAT,    MVT::v4i32,   {  2 } }, // pmaxud + psubd
3807     { ISD::UADDSAT,    MVT::v4i32,   {  3 } }, // not + pminud + paddd
3808     { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3809     { ISD::FMAXNUM,    MVT::v4f32,   {  4,  4,  4,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3810     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3811     { ISD::FMAXNUM,    MVT::v2f64,   {  4,  4,  4,  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3812     { ISD::FSQRT,      MVT::f32,     { 18, 18,  1,  1 } }, // Nehalem from http://www.agner.org/
3813     { ISD::FSQRT,      MVT::v4f32,   { 18, 18,  1,  1 } }, // Nehalem from http://www.agner.org/
3814   };
3815   static const CostKindTblEntry SSE41CostTbl[] = {
         // Intrinsic costs for SSE4.1 subtargets (file header: modelled on Penryn).
         // Only integer ABS and min/max rows are listed here; the element types
         // absent from this table (e.g. SMAX v8i16, UMAX v16i8) have rows in
         // SSE2CostTbl instead.
3816     { ISD::ABS,        MVT::v2i64,   {  3,  4,  3,  5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3817     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  3 } },
3818     { ISD::SMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
3819     { ISD::SMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
3820     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
3821     { ISD::SMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
3822     { ISD::SMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
3823     { ISD::UMAX,       MVT::v2i64,   {  2, 11,  6,  7 } },
3824     { ISD::UMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
3825     { ISD::UMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
3826     { ISD::UMIN,       MVT::v2i64,   {  2, 11,  6,  7 } },
3827     { ISD::UMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
3828     { ISD::UMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
3829   };
3830   static const CostKindTblEntry SSSE3CostTbl[] = {
         // Intrinsic costs for SSSE3 subtargets: 128-bit integer ABS and the
         // bit-manipulation ops (BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ).
         // NOTE(review): the four values per row appear to be one cost per TTI
         // cost kind - confirm the field order against CostKindTblEntry.
3831     { ISD::ABS,        MVT::v4i32,   {  1,  2,  1,  1 } },
3832     { ISD::ABS,        MVT::v8i16,   {  1,  2,  1,  1 } },
3833     { ISD::ABS,        MVT::v16i8,   {  1,  2,  1,  1 } },
3834     { ISD::BITREVERSE, MVT::v2i64,   { 16, 20, 11, 21 } },
3835     { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 11, 21 } },
3836     { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 11, 21 } },
3837     { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 10, 16 } },
3838     { ISD::BSWAP,      MVT::v2i64,   {  5,  5,  1,  5 } },
3839     { ISD::BSWAP,      MVT::v4i32,   {  5,  5,  1,  5 } },
3840     { ISD::BSWAP,      MVT::v8i16,   {  5,  5,  1,  5 } },
3841     { ISD::CTLZ,       MVT::v2i64,   { 18, 28, 28, 35 } },
3842     { ISD::CTLZ,       MVT::v4i32,   { 15, 20, 22, 28 } },
3843     { ISD::CTLZ,       MVT::v8i16,   { 13, 17, 16, 22 } },
3844     { ISD::CTLZ,       MVT::v16i8,   { 11, 15, 10, 16 } },
3845     { ISD::CTPOP,      MVT::v2i64,   { 13, 19, 12, 18 } },
3846     { ISD::CTPOP,      MVT::v4i32,   { 18, 24, 16, 22 } },
3847     { ISD::CTPOP,      MVT::v8i16,   { 13, 18, 14, 20 } },
3848     { ISD::CTPOP,      MVT::v16i8,   { 11, 12, 10, 16 } },
3849     { ISD::CTTZ,       MVT::v2i64,   { 13, 25, 15, 22 } },
3850     { ISD::CTTZ,       MVT::v4i32,   { 18, 26, 19, 25 } },
3851     { ISD::CTTZ,       MVT::v8i16,   { 13, 20, 17, 23 } },
3852     { ISD::CTTZ,       MVT::v16i8,   { 11, 16, 13, 19 } }
3853   };
3854   static const CostKindTblEntry SSE2CostTbl[] = {
         // Intrinsic costs for SSE2 subtargets: 128-bit integer ops plus f64 /
         // v2f64 FMAXNUM and FSQRT (the f32 counterparts are in SSE1CostTbl).
         // Rows with a single value (the *SAT ops) spell out only the first cost
         // field. NOTE(review): confirm how CostKindTblEntry treats the omitted
         // fields.
3855     { ISD::ABS,        MVT::v2i64,   {  3,  6,  5,  5 } },
3856     { ISD::ABS,        MVT::v4i32,   {  1,  4,  4,  4 } },
3857     { ISD::ABS,        MVT::v8i16,   {  1,  2,  3,  3 } },
3858     { ISD::ABS,        MVT::v16i8,   {  1,  2,  3,  3 } },
3859     { ISD::BITREVERSE, MVT::v2i64,   { 16, 20, 32, 32 } },
3860     { ISD::BITREVERSE, MVT::v4i32,   { 16, 20, 30, 30 } },
3861     { ISD::BITREVERSE, MVT::v8i16,   { 16, 20, 25, 25 } },
3862     { ISD::BITREVERSE, MVT::v16i8,   { 11, 12, 21, 21 } },
3863     { ISD::BSWAP,      MVT::v2i64,   {  5,  6, 11, 11 } },
3864     { ISD::BSWAP,      MVT::v4i32,   {  5,  5,  9,  9 } },
3865     { ISD::BSWAP,      MVT::v8i16,   {  5,  5,  4,  5 } },
3866     { ISD::CTLZ,       MVT::v2i64,   { 10, 45, 36, 38 } },
3867     { ISD::CTLZ,       MVT::v4i32,   { 10, 45, 38, 40 } },
3868     { ISD::CTLZ,       MVT::v8i16,   {  9, 38, 32, 34 } },
3869     { ISD::CTLZ,       MVT::v16i8,   {  8, 39, 29, 32 } },
3870     { ISD::CTPOP,      MVT::v2i64,   { 12, 26, 16, 18 } },
3871     { ISD::CTPOP,      MVT::v4i32,   { 15, 29, 21, 23 } },
3872     { ISD::CTPOP,      MVT::v8i16,   { 13, 25, 18, 20 } },
3873     { ISD::CTPOP,      MVT::v16i8,   { 10, 21, 14, 16 } },
3874     { ISD::CTTZ,       MVT::v2i64,   { 14, 28, 19, 21 } },
3875     { ISD::CTTZ,       MVT::v4i32,   { 18, 31, 24, 26 } },
3876     { ISD::CTTZ,       MVT::v8i16,   { 16, 27, 21, 23 } },
3877     { ISD::CTTZ,       MVT::v16i8,   { 13, 23, 17, 19 } },
3878     { ISD::SADDSAT,    MVT::v8i16,   {  1 } },
3879     { ISD::SADDSAT,    MVT::v16i8,   {  1 } },
3880     { ISD::SMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
3881     { ISD::SMAX,       MVT::v4i32,   {  2,  4,  5,  5 } },
3882     { ISD::SMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
3883     { ISD::SMAX,       MVT::v16i8,   {  2,  4,  5,  5 } },
3884     { ISD::SMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
3885     { ISD::SMIN,       MVT::v4i32,   {  2,  4,  5,  5 } },
3886     { ISD::SMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
3887     { ISD::SMIN,       MVT::v16i8,   {  2,  4,  5,  5 } },
3888     { ISD::SSUBSAT,    MVT::v8i16,   {  1 } },
3889     { ISD::SSUBSAT,    MVT::v16i8,   {  1 } },
3890     { ISD::UADDSAT,    MVT::v8i16,   {  1 } },
3891     { ISD::UADDSAT,    MVT::v16i8,   {  1 } },
3892     { ISD::UMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
3893     { ISD::UMAX,       MVT::v4i32,   {  2,  5,  8,  8 } },
3894     { ISD::UMAX,       MVT::v8i16,   {  1,  3,  3,  3 } },
3895     { ISD::UMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
3896     { ISD::UMIN,       MVT::v2i64,   {  4,  8, 15, 15 } },
3897     { ISD::UMIN,       MVT::v4i32,   {  2,  5,  8,  8 } },
3898     { ISD::UMIN,       MVT::v8i16,   {  1,  3,  3,  3 } },
3899     { ISD::UMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
3900     { ISD::USUBSAT,    MVT::v8i16,   {  1 } },
3901     { ISD::USUBSAT,    MVT::v16i8,   {  1 } },
3902     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } },
3903     { ISD::FMAXNUM,    MVT::v2f64,   {  4,  6,  6,  6 } },
3904     { ISD::FSQRT,      MVT::f64,     { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
3905     { ISD::FSQRT,      MVT::v2f64,   { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
3906   };
3907   static const CostKindTblEntry SSE1CostTbl[] = {
         // Intrinsic costs for SSE1 subtargets: single-precision FMAXNUM and FSQRT
         // only (the f64 / v2f64 counterparts are in SSE2CostTbl).
3908     { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } },
3909     { ISD::FMAXNUM,    MVT::v4f32,   {  4,  6,  6,  6 } },
3910     { ISD::FSQRT,      MVT::f32,     { 28, 30,  1,  2 } }, // Pentium III from http://www.agner.org/
3911     { ISD::FSQRT,      MVT::v4f32,   { 56, 56,  1,  2 } }, // Pentium III from http://www.agner.org/
3912   };
3913   static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
         // Scalar i64 CTTZ on BMI targets costs 1 (X64CostTbl lists 3 for the
         // same op). NOTE(review): presumably a single TZCNT - confirm.
3914     { ISD::CTTZ,       MVT::i64,     {  1 } },
3915   };
3916   static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
         // Scalar i32/i16/i8 CTTZ on BMI targets costs 1 (X86CostTbl lists 3 for
         // the same ops). NOTE(review): presumably a single TZCNT - confirm.
3917     { ISD::CTTZ,       MVT::i32,     {  1 } },
3918     { ISD::CTTZ,       MVT::i16,     {  1 } },
3919     { ISD::CTTZ,       MVT::i8,      {  1 } },
3920   };
3921   static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
         // Scalar i64 CTLZ on LZCNT targets costs 1 (X64CostTbl lists 4 for the
         // same op).
3922     { ISD::CTLZ,       MVT::i64,     {  1 } },
3923   };
3924   static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
         // Scalar CTLZ on LZCNT targets. The i16 / i8 rows cost one more than i32.
         // NOTE(review): presumably for an extension/adjustment around a wider
         // LZCNT - confirm.
3925     { ISD::CTLZ,       MVT::i32,     {  1 } },
3926     { ISD::CTLZ,       MVT::i16,     {  2 } },
3927     { ISD::CTLZ,       MVT::i8,      {  2 } },
3928   };
3929   static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
         // Scalar i64 CTPOP on POPCNT targets: all four cost fields are 1,
         // versus the { 10, 6, 19, 19 } row in X64CostTbl.
3930     { ISD::CTPOP,      MVT::i64,     {  1, 1, 1, 1 } }, // popcnt
3931   };
3932   static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
         // Scalar i32/i16/i8 CTPOP on POPCNT targets. Per the row comments, the
         // i16 / i8 rows also cost a zero-extension before the popcnt.
3933     { ISD::CTPOP,      MVT::i32,     {  1, 1, 1, 1 } }, // popcnt
3934     { ISD::CTPOP,      MVT::i16,     {  1, 1, 2, 2 } }, // popcnt(zext())
3935     { ISD::CTPOP,      MVT::i8,      {  1, 1, 2, 2 } }, // popcnt(zext())
3936   };
3937   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
         // Baseline scalar i64 intrinsic costs with no feature-specific table.
         // FSHL is the only funnel-shift opcode listed; the mapping code later
         // in this function costs fshr through the same rows.
         // Rows with a single value spell out only the first cost field.
         // NOTE(review): confirm how CostKindTblEntry treats the omitted fields.
3938     { ISD::ABS,        MVT::i64,     {  1,  2,  3,  4 } }, // SUB+CMOV
3939     { ISD::BITREVERSE, MVT::i64,     { 10, 12, 20, 22 } },
3940     { ISD::BSWAP,      MVT::i64,     {  1,  2,  1,  2 } },
3941     { ISD::CTLZ,       MVT::i64,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3942     { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSR+XOR
3943     { ISD::CTTZ,       MVT::i64,     {  3 } }, // TEST+BSF+CMOV/BRANCH
3944     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSF
3945     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
3946     { ISD::ROTL,       MVT::i64,     {  2, 3, 1, 3 } },
3947     { ISD::ROTR,       MVT::i64,     {  2, 3, 1, 3 } },
3948     { ISD::FSHL,       MVT::i64,     {  4, 4, 1, 4 } },
3949     { ISD::SMAX,       MVT::i64,     {  1,  3,  2,  3 } },
3950     { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
3951     { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
3952     { ISD::UMIN,       MVT::i64,     {  1,  3,  2,  3 } },
3953     { ISD::SADDO,      MVT::i64,     {  1 } },
3954     { ISD::UADDO,      MVT::i64,     {  1 } },
3955     { ISD::UMULO,      MVT::i64,     {  2 } }, // mulq + seto
3956   };
3957   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
         // Baseline scalar i32/i16/i8 intrinsic costs with no feature-specific
         // table. SADDO/UADDO/UMULO rows also serve the sub/signed-mul overflow
         // intrinsics, and FSHL rows also serve fshr (see the mapping code later
         // in this function).
         // Rows with a single value spell out only the first cost field.
         // NOTE(review): confirm how CostKindTblEntry treats the omitted fields.
3958     { ISD::ABS,        MVT::i32,     {  1,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
3959     { ISD::ABS,        MVT::i16,     {  2,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
3960     { ISD::ABS,        MVT::i8,      {  2,  4,  4,  4 } }, // SUB+XOR+SRA
3961     { ISD::BITREVERSE, MVT::i32,     {  9, 12, 17, 19 } },
3962     { ISD::BITREVERSE, MVT::i16,     {  9, 12, 17, 19 } },
3963     { ISD::BITREVERSE, MVT::i8,      {  7,  9, 13, 14 } },
3964     { ISD::BSWAP,      MVT::i32,     {  1,  1,  1,  1 } },
3965     { ISD::BSWAP,      MVT::i16,     {  1,  2,  1,  2 } }, // ROL
3966     { ISD::CTLZ,       MVT::i32,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3967     { ISD::CTLZ,       MVT::i16,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3968     { ISD::CTLZ,       MVT::i8,      {  4 } }, // BSR+XOR or BSR+XOR+CMOV
3969     { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{  1,  1,  1,  1 } }, // BSR+XOR
3970     { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{  2,  2,  3,  3 } }, // BSR+XOR
3971     { ISD::CTLZ_ZERO_UNDEF, MVT::i8, {  2,  2,  3,  3 } }, // BSR+XOR
3972     { ISD::CTTZ,       MVT::i32,     {  3 } }, // TEST+BSF+CMOV/BRANCH
3973     { ISD::CTTZ,       MVT::i16,     {  3 } }, // TEST+BSF+CMOV/BRANCH
3974     { ISD::CTTZ,       MVT::i8,      {  3 } }, // TEST+BSF+CMOV/BRANCH
3975     { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{  1,  1,  1,  1 } }, // BSF
3976     { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{  2,  2,  1,  1 } }, // BSF
3977     { ISD::CTTZ_ZERO_UNDEF, MVT::i8, {  2,  2,  1,  1 } }, // BSF
3978     { ISD::CTPOP,      MVT::i32,     {  8,  7, 15, 15 } },
3979     { ISD::CTPOP,      MVT::i16,     {  9,  8, 17, 17 } },
3980     { ISD::CTPOP,      MVT::i8,      {  7,  6, 13, 13 } },
3981     { ISD::ROTL,       MVT::i32,     {  2,  3,  1,  3 } },
3982     { ISD::ROTL,       MVT::i16,     {  2,  3,  1,  3 } },
3983     { ISD::ROTL,       MVT::i8,      {  2,  3,  1,  3 } },
3984     { ISD::ROTR,       MVT::i32,     {  2,  3,  1,  3 } },
3985     { ISD::ROTR,       MVT::i16,     {  2,  3,  1,  3 } },
3986     { ISD::ROTR,       MVT::i8,      {  2,  3,  1,  3 } },
3987     { ISD::FSHL,       MVT::i32,     {  4,  4,  1,  4 } },
3988     { ISD::FSHL,       MVT::i16,     {  4,  4,  2,  5 } },
3989     { ISD::FSHL,       MVT::i8,      {  4,  4,  2,  5 } },
3990     { ISD::SMAX,       MVT::i32,     {  1,  2,  2,  3 } },
3991     { ISD::SMAX,       MVT::i16,     {  1,  4,  2,  4 } },
3992     { ISD::SMAX,       MVT::i8,      {  1,  4,  2,  4 } },
3993     { ISD::SMIN,       MVT::i32,     {  1,  2,  2,  3 } },
3994     { ISD::SMIN,       MVT::i16,     {  1,  4,  2,  4 } },
3995     { ISD::SMIN,       MVT::i8,      {  1,  4,  2,  4 } },
3996     { ISD::UMAX,       MVT::i32,     {  1,  2,  2,  3 } },
3997     { ISD::UMAX,       MVT::i16,     {  1,  4,  2,  4 } },
3998     { ISD::UMAX,       MVT::i8,      {  1,  4,  2,  4 } },
3999     { ISD::UMIN,       MVT::i32,     {  1,  2,  2,  3 } },
4000     { ISD::UMIN,       MVT::i16,     {  1,  4,  2,  4 } },
4001     { ISD::UMIN,       MVT::i8,      {  1,  4,  2,  4 } },
4002     { ISD::SADDO,      MVT::i32,     {  1 } },
4003     { ISD::SADDO,      MVT::i16,     {  1 } },
4004     { ISD::SADDO,      MVT::i8,      {  1 } },
4005     { ISD::UADDO,      MVT::i32,     {  1 } },
4006     { ISD::UADDO,      MVT::i16,     {  1 } },
4007     { ISD::UADDO,      MVT::i8,      {  1 } },
4008     { ISD::UMULO,      MVT::i32,     {  2 } }, // mul + seto
4009     { ISD::UMULO,      MVT::i16,     {  2 } },
4010     { ISD::UMULO,      MVT::i8,      {  2 } },
4011   };
4012 
4013   Type *RetTy = ICA.getReturnType();
       // Map the intrinsic ID onto the ISD opcode used as the key of the cost
       // tables above. Intrinsics that share costs are folded onto one opcode.
       // OpTy is the type that gets legalized for the lookup. It defaults to the
       // return type and is overridden for the *_with_overflow intrinsics.
4014   Type *OpTy = RetTy;
4015   Intrinsic::ID IID = ICA.getID();
       // DELETED_NODE acts as the "no table mapping" sentinel: it is left
       // untouched by the default case and tested right after the switch.
4016   unsigned ISD = ISD::DELETED_NODE;
4017   switch (IID) {
4018   default:
4019     break;
4020   case Intrinsic::abs:
4021     ISD = ISD::ABS;
4022     break;
4023   case Intrinsic::bitreverse:
4024     ISD = ISD::BITREVERSE;
4025     break;
4026   case Intrinsic::bswap:
4027     ISD = ISD::BSWAP;
4028     break;
4029   case Intrinsic::ctlz:
4030     ISD = ISD::CTLZ;
4031     break;
4032   case Intrinsic::ctpop:
4033     ISD = ISD::CTPOP;
4034     break;
4035   case Intrinsic::cttz:
4036     ISD = ISD::CTTZ;
4037     break;
4038   case Intrinsic::fshl:
4039     ISD = ISD::FSHL;
         // A funnel shift whose first two arguments are the same value is costed
         // as a rotate. This needs the actual arguments, so it is skipped for
         // type-only queries.
4040     if (!ICA.isTypeBasedOnly()) {
4041       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4042       if (Args[0] == Args[1])
4043         ISD = ISD::ROTL;
4044     }
4045     break;
4046   case Intrinsic::fshr:
4047     // FSHR has same costs so don't duplicate.
4048     ISD = ISD::FSHL;
         // Same rotate special case as fshl, but mapped to ROTR.
4049     if (!ICA.isTypeBasedOnly()) {
4050       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4051       if (Args[0] == Args[1])
4052         ISD = ISD::ROTR;
4053     }
4054     break;
4055   case Intrinsic::maxnum:
4056   case Intrinsic::minnum:
4057     // FMINNUM has same costs so don't duplicate.
4058     ISD = ISD::FMAXNUM;
4059     break;
4060   case Intrinsic::sadd_sat:
4061     ISD = ISD::SADDSAT;
4062     break;
4063   case Intrinsic::smax:
4064     ISD = ISD::SMAX;
4065     break;
4066   case Intrinsic::smin:
4067     ISD = ISD::SMIN;
4068     break;
4069   case Intrinsic::ssub_sat:
4070     ISD = ISD::SSUBSAT;
4071     break;
4072   case Intrinsic::uadd_sat:
4073     ISD = ISD::UADDSAT;
4074     break;
4075   case Intrinsic::umax:
4076     ISD = ISD::UMAX;
4077     break;
4078   case Intrinsic::umin:
4079     ISD = ISD::UMIN;
4080     break;
4081   case Intrinsic::usub_sat:
4082     ISD = ISD::USUBSAT;
4083     break;
4084   case Intrinsic::sqrt:
4085     ISD = ISD::FSQRT;
4086     break;
4087   case Intrinsic::sadd_with_overflow:
4088   case Intrinsic::ssub_with_overflow:
4089     // SSUBO has same costs so don't duplicate.
4090     ISD = ISD::SADDO;
         // Cost on the first contained type of the return type, not on the
         // aggregate return type itself.
4091     OpTy = RetTy->getContainedType(0);
4092     break;
4093   case Intrinsic::uadd_with_overflow:
4094   case Intrinsic::usub_with_overflow:
4095     // USUBO has same costs so don't duplicate.
4096     ISD = ISD::UADDO;
4097     OpTy = RetTy->getContainedType(0);
4098     break;
4099   case Intrinsic::umul_with_overflow:
4100   case Intrinsic::smul_with_overflow:
4101     // SMULO has same costs so don't duplicate.
4102     ISD = ISD::UMULO;
4103     OpTy = RetTy->getContainedType(0);
4104     break;
4105   }
4106 
4107   if (ISD != ISD::DELETED_NODE) {
4108     // Legalize the type.
4109     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4110     MVT MTy = LT.second;
4111 
4112     // Attempt to lookup cost.
4113     if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
4114         MTy.isVector()) {
4115       // With PSHUFB the code is very similar for all types. If we have integer
4116       // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
4117       // we also need a PSHUFB.
4118       unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
4119 
4120       // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
4121       // instructions. We also need an extract and an insert.
4122       if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
4123             (ST->hasBWI() && MTy.is512BitVector())))
4124         Cost = Cost * 2 + 2;
4125 
4126       return LT.first * Cost;
4127     }
4128 
4129     // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4130     if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4131          (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4132         !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4133       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4134       if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4135         if (Cst->isAllOnesValue())
4136           ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4137     }
4138 
4139     // FSQRT is a single instruction.
4140     if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4141       return LT.first;
4142 
4143     auto adjustTableCost = [](int ISD, unsigned Cost,
4144                               InstructionCost LegalizationCost,
4145                               FastMathFlags FMF) {
4146       // If there are no NANs to deal with, then these are reduced to a
4147       // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4148       // assume is used in the non-fast case.
4149       if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4150         if (FMF.noNaNs())
4151           return LegalizationCost * 1;
4152       }
4153       return LegalizationCost * (int)Cost;
4154     };
4155 
4156     if (ST->useGLMDivSqrtCosts())
4157       if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4158         if (auto KindCost = Entry->Cost[CostKind])
4159           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4160                                  ICA.getFlags());
4161 
4162     if (ST->useSLMArithCosts())
4163       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4164         if (auto KindCost = Entry->Cost[CostKind])
4165           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4166                                  ICA.getFlags());
4167 
4168     if (ST->hasVBMI2())
4169       if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4170         if (auto KindCost = Entry->Cost[CostKind])
4171           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4172                                  ICA.getFlags());
4173 
4174     if (ST->hasBITALG())
4175       if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4176         if (auto KindCost = Entry->Cost[CostKind])
4177           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4178                                  ICA.getFlags());
4179 
4180     if (ST->hasVPOPCNTDQ())
4181       if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4182         if (auto KindCost = Entry->Cost[CostKind])
4183           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4184                                  ICA.getFlags());
4185 
4186     if (ST->hasCDI())
4187       if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4188         if (auto KindCost = Entry->Cost[CostKind])
4189           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4190                                  ICA.getFlags());
4191 
4192     if (ST->hasBWI())
4193       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4194         if (auto KindCost = Entry->Cost[CostKind])
4195           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4196                                  ICA.getFlags());
4197 
4198     if (ST->hasAVX512())
4199       if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4200         if (auto KindCost = Entry->Cost[CostKind])
4201           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4202                                  ICA.getFlags());
4203 
4204     if (ST->hasXOP())
4205       if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4206         if (auto KindCost = Entry->Cost[CostKind])
4207           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4208                                  ICA.getFlags());
4209 
4210     if (ST->hasAVX2())
4211       if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4212         if (auto KindCost = Entry->Cost[CostKind])
4213           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4214                                  ICA.getFlags());
4215 
4216     if (ST->hasAVX())
4217       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4218         if (auto KindCost = Entry->Cost[CostKind])
4219           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4220                                  ICA.getFlags());
4221 
4222     if (ST->hasSSE42())
4223       if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4224         if (auto KindCost = Entry->Cost[CostKind])
4225           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4226                                  ICA.getFlags());
4227 
4228     if (ST->hasSSE41())
4229       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4230         if (auto KindCost = Entry->Cost[CostKind])
4231           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4232                                  ICA.getFlags());
4233 
4234     if (ST->hasSSSE3())
4235       if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4236         if (auto KindCost = Entry->Cost[CostKind])
4237           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4238                                  ICA.getFlags());
4239 
4240     if (ST->hasSSE2())
4241       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4242         if (auto KindCost = Entry->Cost[CostKind])
4243           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4244                                  ICA.getFlags());
4245 
4246     if (ST->hasSSE1())
4247       if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4248         if (auto KindCost = Entry->Cost[CostKind])
4249           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4250                                  ICA.getFlags());
4251 
4252     if (ST->hasBMI()) {
4253       if (ST->is64Bit())
4254         if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4255           if (auto KindCost = Entry->Cost[CostKind])
4256             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4257                                    ICA.getFlags());
4258 
4259       if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4260         if (auto KindCost = Entry->Cost[CostKind])
4261           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4262                                  ICA.getFlags());
4263     }
4264 
4265     if (ST->hasLZCNT()) {
4266       if (ST->is64Bit())
4267         if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4268           if (auto KindCost = Entry->Cost[CostKind])
4269             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4270                                    ICA.getFlags());
4271 
4272       if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4273         if (auto KindCost = Entry->Cost[CostKind])
4274           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4275                                  ICA.getFlags());
4276     }
4277 
4278     if (ST->hasPOPCNT()) {
4279       if (ST->is64Bit())
4280         if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4281           if (auto KindCost = Entry->Cost[CostKind])
4282             return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4283                                    ICA.getFlags());
4284 
4285       if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4286         if (auto KindCost = Entry->Cost[CostKind])
4287           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4288                                  ICA.getFlags());
4289     }
4290 
4291     if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4292       if (const Instruction *II = ICA.getInst()) {
4293         if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4294           return TTI::TCC_Free;
4295         if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4296           if (LI->hasOneUse())
4297             return TTI::TCC_Free;
4298         }
4299       }
4300     }
4301 
4302     if (ST->is64Bit())
4303       if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4304         if (auto KindCost = Entry->Cost[CostKind])
4305           return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4306                                  ICA.getFlags());
4307 
4308     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4309       if (auto KindCost = Entry->Cost[CostKind])
4310         return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4311   }
4312 
4313   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4314 }
4315 
4316 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4317                                                TTI::TargetCostKind CostKind,
4318                                                unsigned Index, Value *Op0,
4319                                                Value *Op1) {
4320   static const CostTblEntry SLMCostTbl[] = {
4321      { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
4322      { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
4323      { ISD::EXTRACT_VECTOR_ELT,       MVT::i32,     4 },
4324      { ISD::EXTRACT_VECTOR_ELT,       MVT::i64,     7 }
4325    };
4326 
4327   assert(Val->isVectorTy() && "This must be a vector type");
4328   Type *ScalarType = Val->getScalarType();
4329   InstructionCost RegisterFileMoveCost = 0;
4330 
4331   // Non-immediate extraction/insertion can be handled as a sequence of
4332   // aliased loads+stores via the stack.
4333   if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4334                        Opcode == Instruction::InsertElement)) {
4335     // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4336     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4337 
4338     // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4339     assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4340     Align VecAlign = DL.getPrefTypeAlign(Val);
4341     Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4342 
4343     // Extract - store vector to stack, load scalar.
4344     if (Opcode == Instruction::ExtractElement) {
4345       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4346              getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4347                              CostKind);
4348     }
4349     // Insert - store vector to stack, store scalar, load vector.
4350     if (Opcode == Instruction::InsertElement) {
4351       return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4352              getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4353                              CostKind) +
4354              getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4355     }
4356   }
4357 
4358   if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4359                        Opcode == Instruction::InsertElement)) {
4360     // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4361     if (Opcode == Instruction::ExtractElement &&
4362         ScalarType->getScalarSizeInBits() == 1 &&
4363         cast<FixedVectorType>(Val)->getNumElements() > 1)
4364       return 1;
4365 
4366     // Legalize the type.
4367     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4368 
4369     // This type is legalized to a scalar type.
4370     if (!LT.second.isVector())
4371       return 0;
4372 
4373     // The type may be split. Normalize the index to the new type.
4374     unsigned SizeInBits = LT.second.getSizeInBits();
4375     unsigned NumElts = LT.second.getVectorNumElements();
4376     unsigned SubNumElts = NumElts;
4377     Index = Index % NumElts;
4378 
4379     // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4380     // For inserts, we also need to insert the subvector back.
4381     if (SizeInBits > 128) {
4382       assert((SizeInBits % 128) == 0 && "Illegal vector");
4383       unsigned NumSubVecs = SizeInBits / 128;
4384       SubNumElts = NumElts / NumSubVecs;
4385       if (SubNumElts <= Index) {
4386         RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4387         Index %= SubNumElts;
4388       }
4389     }
4390 
4391     MVT MScalarTy = LT.second.getScalarType();
4392     auto IsCheapPInsrPExtrInsertPS = [&]() {
4393       // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4394       // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4395       return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4396              (MScalarTy.isInteger() && ST->hasSSE41()) ||
4397              (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4398               Opcode == Instruction::InsertElement);
4399     };
4400 
4401     if (Index == 0) {
4402       // Floating point scalars are already located in index #0.
4403       // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4404       // true for all.
4405       if (ScalarType->isFloatingPointTy() &&
4406           (Opcode != Instruction::InsertElement || !Op0 ||
4407            isa<UndefValue>(Op0)))
4408         return RegisterFileMoveCost;
4409 
4410       if (Opcode == Instruction::InsertElement &&
4411           isa_and_nonnull<UndefValue>(Op0)) {
4412         // Consider the gather cost to be cheap.
4413         if (isa_and_nonnull<LoadInst>(Op1))
4414           return RegisterFileMoveCost;
4415         if (!IsCheapPInsrPExtrInsertPS()) {
4416           // mov constant-to-GPR + movd/movq GPR -> XMM.
4417           if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4418             return 2 + RegisterFileMoveCost;
4419           // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4420           return 1 + RegisterFileMoveCost;
4421         }
4422       }
4423 
4424       // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4425       if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4426         return 1 + RegisterFileMoveCost;
4427     }
4428 
4429     int ISD = TLI->InstructionOpcodeToISD(Opcode);
4430     assert(ISD && "Unexpected vector opcode");
4431     if (ST->useSLMArithCosts())
4432       if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4433         return Entry->Cost + RegisterFileMoveCost;
4434 
4435     // Consider cheap cases.
4436     if (IsCheapPInsrPExtrInsertPS())
4437       return 1 + RegisterFileMoveCost;
4438 
4439     // For extractions we just need to shuffle the element to index 0, which
4440     // should be very cheap (assume cost = 1). For insertions we need to shuffle
4441     // the elements to its destination. In both cases we must handle the
4442     // subvector move(s).
4443     // If the vector type is already less than 128-bits then don't reduce it.
4444     // TODO: Under what circumstances should we shuffle using the full width?
4445     InstructionCost ShuffleCost = 1;
4446     if (Opcode == Instruction::InsertElement) {
4447       auto *SubTy = cast<VectorType>(Val);
4448       EVT VT = TLI->getValueType(DL, Val);
4449       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4450         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4451       ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4452                                    CostKind, 0, SubTy);
4453     }
4454     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4455     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4456   }
4457 
4458   return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4459          RegisterFileMoveCost;
4460 }
4461 
4462 InstructionCost
4463 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4464                                      bool Insert, bool Extract,
4465                                      TTI::TargetCostKind CostKind) {
4466   assert(DemandedElts.getBitWidth() ==
4467              cast<FixedVectorType>(Ty)->getNumElements() &&
4468          "Vector size mismatch");
4469 
4470   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4471   MVT MScalarTy = LT.second.getScalarType();
4472   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4473   InstructionCost Cost = 0;
4474 
4475   constexpr unsigned LaneBitWidth = 128;
4476   assert((LegalVectorBitWidth < LaneBitWidth ||
4477           (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4478          "Illegal vector");
4479 
4480   const int NumLegalVectors = *LT.first.getValue();
4481   assert(NumLegalVectors >= 0 && "Negative cost!");
4482 
4483   // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
4484   // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4485   if (Insert) {
4486     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4487         (MScalarTy.isInteger() && ST->hasSSE41()) ||
4488         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4489       // For types we can insert directly, insertion into 128-bit sub vectors is
4490       // cheap, followed by a cheap chain of concatenations.
4491       if (LegalVectorBitWidth <= LaneBitWidth) {
4492         Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4493                                                 /*Extract*/ false, CostKind);
4494       } else {
4495         // In each 128-lane, if at least one index is demanded but not all
4496         // indices are demanded and this 128-lane is not the first 128-lane of
4497         // the legalized-vector, then this 128-lane needs a extracti128; If in
4498         // each 128-lane, there is at least one demanded index, this 128-lane
4499         // needs a inserti128.
4500 
4501         // The following cases will help you build a better understanding:
4502         // Assume we insert several elements into a v8i32 vector in avx2,
4503         // Case#1: inserting into 1th index needs vpinsrd + inserti128.
4504         // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
4505         // inserti128.
4506         // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4507         assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4508         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4509         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4510         unsigned NumLegalElts =
4511             LT.second.getVectorNumElements() * NumLegalVectors;
4512         assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4513                "Vector has been legalized to smaller element count");
4514         assert((NumLegalElts % NumLanesTotal) == 0 &&
4515                "Unexpected elts per lane");
4516         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4517 
4518         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4519         auto *LaneTy =
4520             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4521 
4522         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4523           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4524               NumEltsPerLane, NumEltsPerLane * I);
4525           if (LaneEltMask.isZero())
4526             continue;
4527           // FIXME: we don't need to extract if all non-demanded elements
4528           //        are legalization-inserted padding.
4529           if (!LaneEltMask.isAllOnes())
4530             Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4531                                    CostKind, I * NumEltsPerLane, LaneTy);
4532           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4533                                                   /*Extract*/ false, CostKind);
4534         }
4535 
4536         APInt AffectedLanes =
4537             APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4538         APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4539             AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4540         for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4541           for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4542             unsigned I = NumLegalLanes * LegalVec + Lane;
4543             // No need to insert unaffected lane; or lane 0 of each legal vector
4544             // iff ALL lanes of that vector were affected and will be inserted.
4545             if (!AffectedLanes[I] ||
4546                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4547               continue;
4548             Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4549                                    CostKind, I * NumEltsPerLane, LaneTy);
4550           }
4551         }
4552       }
4553     } else if (LT.second.isVector()) {
4554       // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4555       // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4556       // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4557       // considered cheap.
4558       if (Ty->isIntOrIntVectorTy())
4559         Cost += DemandedElts.popcount();
4560 
4561       // Get the smaller of the legalized or original pow2-extended number of
4562       // vector elements, which represents the number of unpacks we'll end up
4563       // performing.
4564       unsigned NumElts = LT.second.getVectorNumElements();
4565       unsigned Pow2Elts =
4566           PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4567       Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4568     }
4569   }
4570 
4571   if (Extract) {
4572     // vXi1 can be efficiently extracted with MOVMSK.
4573     // TODO: AVX512 predicate mask handling.
4574     // NOTE: This doesn't work well for roundtrip scalarization.
4575     if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4576       unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4577       unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4578       unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4579       return MOVMSKCost;
4580     }
4581 
4582     if (LT.second.isVector()) {
4583       unsigned NumLegalElts =
4584           LT.second.getVectorNumElements() * NumLegalVectors;
4585       assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4586              "Vector has been legalized to smaller element count");
4587 
4588       // If we're extracting elements from a 128-bit subvector lane,
4589       // we only need to extract each lane once, not for every element.
4590       if (LegalVectorBitWidth > LaneBitWidth) {
4591         unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4592         unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4593         assert((NumLegalElts % NumLanesTotal) == 0 &&
4594                "Unexpected elts per lane");
4595         unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4596 
4597         // Add cost for each demanded 128-bit subvector extraction.
4598         // Luckily this is a lot easier than for insertion.
4599         APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4600         auto *LaneTy =
4601             FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4602 
4603         for (unsigned I = 0; I != NumLanesTotal; ++I) {
4604           APInt LaneEltMask = WidenedDemandedElts.extractBits(
4605               NumEltsPerLane, I * NumEltsPerLane);
4606           if (LaneEltMask.isZero())
4607             continue;
4608           Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4609                                  CostKind, I * NumEltsPerLane, LaneTy);
4610           Cost += BaseT::getScalarizationOverhead(
4611               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4612         }
4613 
4614         return Cost;
4615       }
4616     }
4617 
4618     // Fallback to default extraction.
4619     Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4620                                             Extract, CostKind);
4621   }
4622 
4623   return Cost;
4624 }
4625 
4626 InstructionCost
4627 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4628                                       int VF, const APInt &DemandedDstElts,
4629                                       TTI::TargetCostKind CostKind) {
4630   const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4631   // We don't differentiate element types here, only element bit width.
4632   EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4633 
4634   auto bailout = [&]() {
4635     return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4636                                             DemandedDstElts, CostKind);
4637   };
4638 
4639   // For now, only deal with AVX512 cases.
4640   if (!ST->hasAVX512())
4641     return bailout();
4642 
4643   // Do we have a native shuffle for this element type, or should we promote?
4644   unsigned PromEltTyBits = EltTyBits;
4645   switch (EltTyBits) {
4646   case 32:
4647   case 64:
4648     break; // AVX512F.
4649   case 16:
4650     if (!ST->hasBWI())
4651       PromEltTyBits = 32; // promote to i32, AVX512F.
4652     break;                // AVX512BW
4653   case 8:
4654     if (!ST->hasVBMI())
4655       PromEltTyBits = 32; // promote to i32, AVX512F.
4656     break;                // AVX512VBMI
4657   case 1:
4658     // There is no support for shuffling i1 elements. We *must* promote.
4659     if (ST->hasBWI()) {
4660       if (ST->hasVBMI())
4661         PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4662       else
4663         PromEltTyBits = 16; // promote to i16, AVX512BW.
4664       break;
4665     }
4666     PromEltTyBits = 32; // promote to i32, AVX512F.
4667     break;
4668   default:
4669     return bailout();
4670   }
4671   auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4672 
4673   auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4674   auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4675 
4676   int NumDstElements = VF * ReplicationFactor;
4677   auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4678   auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4679 
4680   // Legalize the types.
4681   MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4682   MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4683   MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4684   MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4685   // They should have legalized into vector types.
4686   if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4687       !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4688     return bailout();
4689 
4690   if (PromEltTyBits != EltTyBits) {
4691     // If we have to perform the shuffle with wider elt type than our data type,
4692     // then we will first need to anyext (we don't care about the new bits)
4693     // the source elements, and then truncate Dst elements.
4694     InstructionCost PromotionCost;
4695     PromotionCost += getCastInstrCost(
4696         Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4697         TargetTransformInfo::CastContextHint::None, CostKind);
4698     PromotionCost +=
4699         getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4700                          /*Src=*/PromDstVecTy,
4701                          TargetTransformInfo::CastContextHint::None, CostKind);
4702     return PromotionCost + getReplicationShuffleCost(PromEltTy,
4703                                                      ReplicationFactor, VF,
4704                                                      DemandedDstElts, CostKind);
4705   }
4706 
4707   assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4708          LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4709          "We expect that the legalization doesn't affect the element width, "
4710          "doesn't coalesce/split elements.");
4711 
4712   unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4713   unsigned NumDstVectors =
4714       divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4715 
4716   auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4717 
4718   // Not all the produced Dst elements may be demanded. In our case,
4719   // given that a single Dst vector is formed by a single shuffle,
4720   // if all elements that will form a single Dst vector aren't demanded,
4721   // then we won't need to do that shuffle, so adjust the cost accordingly.
4722   APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4723       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4724   unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4725 
4726   InstructionCost SingleShuffleCost = getShuffleCost(
4727       TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4728       /*Index=*/0, /*SubTp=*/nullptr);
4729   return NumDstVectorsDemanded * SingleShuffleCost;
4730 }
4731 
4732 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4733                                             MaybeAlign Alignment,
4734                                             unsigned AddressSpace,
4735                                             TTI::TargetCostKind CostKind,
4736                                             TTI::OperandValueInfo OpInfo,
4737                                             const Instruction *I) {
4738   // TODO: Handle other cost kinds.
4739   if (CostKind != TTI::TCK_RecipThroughput) {
4740     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4741       // Store instruction with index and scale costs 2 Uops.
4742       // Check the preceding GEP to identify non-const indices.
4743       if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4744         if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4745           return TTI::TCC_Basic * 2;
4746       }
4747     }
4748     return TTI::TCC_Basic;
4749   }
4750 
4751   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4752          "Invalid Opcode");
4753   // Type legalization can't handle structs
4754   if (TLI->getValueType(DL, Src, true) == MVT::Other)
4755     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4756                                   CostKind);
4757 
4758   // Legalize the type.
4759   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4760 
4761   auto *VTy = dyn_cast<FixedVectorType>(Src);
4762 
4763   InstructionCost Cost = 0;
4764 
4765   // Add a cost for constant load to vector.
4766   if (Opcode == Instruction::Store && OpInfo.isConstant())
4767     Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4768                             /*AddressSpace=*/0, CostKind);
4769 
4770   // Handle the simple case of non-vectors.
4771   // NOTE: this assumes that legalization never creates vector from scalars!
4772   if (!VTy || !LT.second.isVector()) {
4773     // Each load/store unit costs 1.
4774     return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4775   }
4776 
4777   bool IsLoad = Opcode == Instruction::Load;
4778 
4779   Type *EltTy = VTy->getElementType();
4780 
4781   const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4782 
4783   // Source of truth: how many elements were there in the original IR vector?
4784   const unsigned SrcNumElt = VTy->getNumElements();
4785 
4786   // How far have we gotten?
4787   int NumEltRemaining = SrcNumElt;
4788   // Note that we intentionally capture by-reference, NumEltRemaining changes.
4789   auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4790 
4791   const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4792 
4793   // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4794   const unsigned XMMBits = 128;
4795   if (XMMBits % EltTyBits != 0)
4796     // Vector size must be a multiple of the element size. I.e. no padding.
4797     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4798                                   CostKind);
4799   const int NumEltPerXMM = XMMBits / EltTyBits;
4800 
4801   auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4802 
4803   for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4804        NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4805     // How many elements would a single op deal with at once?
4806     if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4807       // Vector size must be a multiple of the element size. I.e. no padding.
4808       return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4809                                     CostKind);
4810     int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4811 
4812     assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4813     assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4814             (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4815            "Unless we haven't halved the op size yet, "
4816            "we have less than two op's sized units of work left.");
4817 
4818     auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4819                           ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4820                           : XMMVecTy;
4821 
4822     assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4823            "After halving sizes, the vector elt count is no longer a multiple "
4824            "of number of elements per operation?");
4825     auto *CoalescedVecTy =
4826         CurrNumEltPerOp == 1
4827             ? CurrVecTy
4828             : FixedVectorType::get(
4829                   IntegerType::get(Src->getContext(),
4830                                    EltTyBits * CurrNumEltPerOp),
4831                   CurrVecTy->getNumElements() / CurrNumEltPerOp);
4832     assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4833                DL.getTypeSizeInBits(CurrVecTy) &&
4834            "coalesciing elements doesn't change vector width.");
4835 
4836     while (NumEltRemaining > 0) {
4837       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
4838 
4839       // Can we use this vector size, as per the remaining element count?
4840       // Iff the vector is naturally aligned, we can do a wide load regardless.
4841       if (NumEltRemaining < CurrNumEltPerOp &&
4842           (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4843           CurrOpSizeBytes != 1)
4844         break; // Try smalled vector size.
4845 
4846       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4847 
4848       // If we have fully processed the previous reg, we need to replenish it.
4849       if (SubVecEltsLeft == 0) {
4850         SubVecEltsLeft += CurrVecTy->getNumElements();
4851         // And that's free only for the 0'th subvector of a legalized vector.
4852         if (!Is0thSubVec)
4853           Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4854                                         : TTI::ShuffleKind::SK_ExtractSubvector,
4855                                  VTy, std::nullopt, CostKind, NumEltDone(),
4856                                  CurrVecTy);
4857       }
4858 
4859       // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4860       // for smaller widths (32/16/8) we have to insert/extract them separately.
4861       // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4862       // but let's pretend that it is also true for 16/8 bit wide ops...)
4863       if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4864         int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4865         assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4866         int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4867         APInt DemandedElts =
4868             APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4869                               CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4870         assert(DemandedElts.popcount() == 1 && "Inserting single value");
4871         Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4872                                          !IsLoad, CostKind);
4873       }
4874 
4875       // This isn't exactly right. We're using slow unaligned 32-byte accesses
4876       // as a proxy for a double-pumped AVX memory interface such as on
4877       // Sandybridge.
4878       // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
4879       // will be scalarized.
4880       if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4881         Cost += 2;
4882       else if (CurrOpSizeBytes < 4)
4883         Cost += 2;
4884       else
4885         Cost += 1;
4886 
4887       SubVecEltsLeft -= CurrNumEltPerOp;
4888       NumEltRemaining -= CurrNumEltPerOp;
4889       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4890     }
4891   }
4892 
4893   assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4894 
4895   return Cost;
4896 }
4897 
4898 InstructionCost
4899 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4900                                   unsigned AddressSpace,
4901                                   TTI::TargetCostKind CostKind) {
4902   bool IsLoad = (Instruction::Load == Opcode);
4903   bool IsStore = (Instruction::Store == Opcode);
4904 
4905   auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4906   if (!SrcVTy)
4907     // To calculate scalar take the regular cost, without mask
4908     return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4909 
4910   unsigned NumElem = SrcVTy->getNumElements();
4911   auto *MaskTy =
4912       FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4913   if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4914       (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4915     // Scalarization
4916     APInt DemandedElts = APInt::getAllOnes(NumElem);
4917     InstructionCost MaskSplitCost = getScalarizationOverhead(
4918         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
4919     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4920         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
4921         CmpInst::BAD_ICMP_PREDICATE, CostKind);
4922     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4923     InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
4924     InstructionCost ValueSplitCost = getScalarizationOverhead(
4925         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
4926     InstructionCost MemopCost =
4927         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4928                                          Alignment, AddressSpace, CostKind);
4929     return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4930   }
4931 
4932   // Legalize the type.
4933   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4934   auto VT = TLI->getValueType(DL, SrcVTy);
4935   InstructionCost Cost = 0;
4936   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4937       LT.second.getVectorNumElements() == NumElem)
4938     // Promotion requires extend/truncate for data and a shuffle for mask.
4939     Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
4940                            CostKind, 0, nullptr) +
4941             getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
4942                            CostKind, 0, nullptr);
4943 
4944   else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4945     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4946                                            LT.second.getVectorNumElements());
4947     // Expanding requires fill mask with zeroes
4948     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
4949                            CostKind, 0, MaskTy);
4950   }
4951 
4952   // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
4953   if (!ST->hasAVX512())
4954     return Cost + LT.first * (IsLoad ? 2 : 8);
4955 
4956   // AVX-512 masked load/store is cheaper
4957   return Cost + LT.first;
4958 }
4959 
4960 InstructionCost
4961 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
4962                                  const Value *Base,
4963                                  const TTI::PointersChainInfo &Info,
4964                                  Type *AccessTy, TTI::TargetCostKind CostKind) {
4965   if (Info.isSameBase() && Info.isKnownStride()) {
4966     // If all the pointers have known stride all the differences are translated
4967     // into constants. X86 memory addressing allows encoding it into
4968     // displacement. So we just need to take the base GEP cost.
4969     if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
4970       SmallVector<const Value *> Indices(BaseGEP->indices());
4971       return getGEPCost(BaseGEP->getSourceElementType(),
4972                         BaseGEP->getPointerOperand(), Indices, nullptr,
4973                         CostKind);
4974     }
4975     return TTI::TCC_Free;
4976   }
4977   return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
4978 }
4979 
4980 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
4981                                                       ScalarEvolution *SE,
4982                                                       const SCEV *Ptr) {
4983   // Address computations in vectorized code with non-consecutive addresses will
4984   // likely result in more instructions compared to scalar code where the
4985   // computation can more often be merged into the index mode. The resulting
4986   // extra micro-ops can significantly decrease throughput.
4987   const unsigned NumVectorInstToHideOverhead = 10;
4988 
4989   // Cost modeling of Strided Access Computation is hidden by the indexing
4990   // modes of X86 regardless of the stride value. We dont believe that there
4991   // is a difference between constant strided access in gerenal and constant
4992   // strided value which is less than or equal to 64.
4993   // Even in the case of (loop invariant) stride whose value is not known at
4994   // compile time, the address computation will not incur more than one extra
4995   // ADD instruction.
4996   if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
4997     // TODO: AVX2 is the current cut-off because we don't have correct
4998     //       interleaving costs for prior ISA's.
4999     if (!BaseT::isStridedAccess(Ptr))
5000       return NumVectorInstToHideOverhead;
5001     if (!BaseT::getConstantStrideStep(SE, Ptr))
5002       return 1;
5003   }
5004 
5005   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5006 }
5007 
5008 InstructionCost
5009 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5010                                        std::optional<FastMathFlags> FMF,
5011                                        TTI::TargetCostKind CostKind) {
5012   if (TTI::requiresOrderedReduction(FMF))
5013     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5014 
5015   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5016   // and make it as the cost.
5017 
5018   static const CostTblEntry SLMCostTbl[] = {
5019     { ISD::FADD,  MVT::v2f64,   3 },
5020     { ISD::ADD,   MVT::v2i64,   5 },
5021   };
5022 
5023   static const CostTblEntry SSE2CostTbl[] = {
5024     { ISD::FADD,  MVT::v2f64,   2 },
5025     { ISD::FADD,  MVT::v2f32,   2 },
5026     { ISD::FADD,  MVT::v4f32,   4 },
5027     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
5028     { ISD::ADD,   MVT::v2i32,   2 }, // FIXME: chosen to be less than v4i32
5029     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
5030     { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
5031     { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
5032     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
5033     { ISD::ADD,   MVT::v2i8,    2 },
5034     { ISD::ADD,   MVT::v4i8,    2 },
5035     { ISD::ADD,   MVT::v8i8,    2 },
5036     { ISD::ADD,   MVT::v16i8,   3 },
5037   };
5038 
5039   static const CostTblEntry AVX1CostTbl[] = {
5040     { ISD::FADD,  MVT::v4f64,   3 },
5041     { ISD::FADD,  MVT::v4f32,   3 },
5042     { ISD::FADD,  MVT::v8f32,   4 },
5043     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
5044     { ISD::ADD,   MVT::v4i64,   3 },
5045     { ISD::ADD,   MVT::v8i32,   5 },
5046     { ISD::ADD,   MVT::v16i16,  5 },
5047     { ISD::ADD,   MVT::v32i8,   4 },
5048   };
5049 
5050   int ISD = TLI->InstructionOpcodeToISD(Opcode);
5051   assert(ISD && "Invalid opcode");
5052 
5053   // Before legalizing the type, give a chance to look up illegal narrow types
5054   // in the table.
5055   // FIXME: Is there a better way to do this?
5056   EVT VT = TLI->getValueType(DL, ValTy);
5057   if (VT.isSimple()) {
5058     MVT MTy = VT.getSimpleVT();
5059     if (ST->useSLMArithCosts())
5060       if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5061         return Entry->Cost;
5062 
5063     if (ST->hasAVX())
5064       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5065         return Entry->Cost;
5066 
5067     if (ST->hasSSE2())
5068       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5069         return Entry->Cost;
5070   }
5071 
5072   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5073 
5074   MVT MTy = LT.second;
5075 
5076   auto *ValVTy = cast<FixedVectorType>(ValTy);
5077 
5078   // Special case: vXi8 mul reductions are performed as vXi16.
5079   if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5080     auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5081     auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5082     return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5083                             TargetTransformInfo::CastContextHint::None,
5084                             CostKind) +
5085            getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5086   }
5087 
5088   InstructionCost ArithmeticCost = 0;
5089   if (LT.first != 1 && MTy.isVector() &&
5090       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5091     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5092     auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5093                                             MTy.getVectorNumElements());
5094     ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5095     ArithmeticCost *= LT.first - 1;
5096   }
5097 
5098   if (ST->useSLMArithCosts())
5099     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5100       return ArithmeticCost + Entry->Cost;
5101 
5102   if (ST->hasAVX())
5103     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5104       return ArithmeticCost + Entry->Cost;
5105 
5106   if (ST->hasSSE2())
5107     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5108       return ArithmeticCost + Entry->Cost;
5109 
5110   // FIXME: These assume a naive kshift+binop lowering, which is probably
5111   // conservative in most cases.
5112   static const CostTblEntry AVX512BoolReduction[] = {
5113     { ISD::AND,  MVT::v2i1,   3 },
5114     { ISD::AND,  MVT::v4i1,   5 },
5115     { ISD::AND,  MVT::v8i1,   7 },
5116     { ISD::AND,  MVT::v16i1,  9 },
5117     { ISD::AND,  MVT::v32i1, 11 },
5118     { ISD::AND,  MVT::v64i1, 13 },
5119     { ISD::OR,   MVT::v2i1,   3 },
5120     { ISD::OR,   MVT::v4i1,   5 },
5121     { ISD::OR,   MVT::v8i1,   7 },
5122     { ISD::OR,   MVT::v16i1,  9 },
5123     { ISD::OR,   MVT::v32i1, 11 },
5124     { ISD::OR,   MVT::v64i1, 13 },
5125   };
5126 
5127   static const CostTblEntry AVX2BoolReduction[] = {
5128     { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
5129     { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
5130     { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
5131     { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
5132   };
5133 
5134   static const CostTblEntry AVX1BoolReduction[] = {
5135     { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
5136     { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
5137     { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
5138     { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
5139     { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
5140     { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
5141     { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
5142     { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
5143   };
5144 
5145   static const CostTblEntry SSE2BoolReduction[] = {
5146     { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
5147     { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
5148     { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
5149     { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
5150     { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
5151     { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
5152     { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
5153     { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
5154   };
5155 
5156   // Handle bool allof/anyof patterns.
5157   if (ValVTy->getElementType()->isIntegerTy(1)) {
5158     InstructionCost ArithmeticCost = 0;
5159     if (LT.first != 1 && MTy.isVector() &&
5160         MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5161       // Type needs to be split. We need LT.first - 1 arithmetic ops.
5162       auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5163                                               MTy.getVectorNumElements());
5164       ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5165       ArithmeticCost *= LT.first - 1;
5166     }
5167 
5168     if (ST->hasAVX512())
5169       if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5170         return ArithmeticCost + Entry->Cost;
5171     if (ST->hasAVX2())
5172       if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5173         return ArithmeticCost + Entry->Cost;
5174     if (ST->hasAVX())
5175       if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5176         return ArithmeticCost + Entry->Cost;
5177     if (ST->hasSSE2())
5178       if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5179         return ArithmeticCost + Entry->Cost;
5180 
5181     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5182   }
5183 
5184   unsigned NumVecElts = ValVTy->getNumElements();
5185   unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5186 
5187   // Special case power of 2 reductions where the scalar type isn't changed
5188   // by type legalization.
5189   if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5190     return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5191 
5192   InstructionCost ReductionCost = 0;
5193 
5194   auto *Ty = ValVTy;
5195   if (LT.first != 1 && MTy.isVector() &&
5196       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5197     // Type needs to be split. We need LT.first - 1 arithmetic ops.
5198     Ty = FixedVectorType::get(ValVTy->getElementType(),
5199                               MTy.getVectorNumElements());
5200     ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5201     ReductionCost *= LT.first - 1;
5202     NumVecElts = MTy.getVectorNumElements();
5203   }
5204 
5205   // Now handle reduction with the legal type, taking into account size changes
5206   // at each level.
5207   while (NumVecElts > 1) {
5208     // Determine the size of the remaining vector we need to reduce.
5209     unsigned Size = NumVecElts * ScalarSize;
5210     NumVecElts /= 2;
5211     // If we're reducing from 256/512 bits, use an extract_subvector.
5212     if (Size > 128) {
5213       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5214       ReductionCost +=
5215           getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5216                          NumVecElts, SubTy);
5217       Ty = SubTy;
5218     } else if (Size == 128) {
5219       // Reducing from 128 bits is a permute of v2f64/v2i64.
5220       FixedVectorType *ShufTy;
5221       if (ValVTy->isFloatingPointTy())
5222         ShufTy =
5223             FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5224       else
5225         ShufTy =
5226             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5227       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5228                                       std::nullopt, CostKind, 0, nullptr);
5229     } else if (Size == 64) {
5230       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5231       FixedVectorType *ShufTy;
5232       if (ValVTy->isFloatingPointTy())
5233         ShufTy =
5234             FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5235       else
5236         ShufTy =
5237             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5238       ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5239                                       std::nullopt, CostKind, 0, nullptr);
5240     } else {
5241       // Reducing from smaller size is a shift by immediate.
5242       auto *ShiftTy = FixedVectorType::get(
5243           Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5244       ReductionCost += getArithmeticInstrCost(
5245           Instruction::LShr, ShiftTy, CostKind,
5246           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5247           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5248     }
5249 
5250     // Add the arithmetic op for this level.
5251     ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5252   }
5253 
5254   // Add the final extract element to the cost.
5255   return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5256                                             CostKind, 0, nullptr, nullptr);
5257 }
5258 
5259 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5260                                           TTI::TargetCostKind CostKind,
5261                                           FastMathFlags FMF) {
5262   IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5263   return getIntrinsicInstrCost(ICA, CostKind);
5264 }
5265 
5266 InstructionCost
5267 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5268                                    FastMathFlags FMF,
5269                                    TTI::TargetCostKind CostKind) {
5270   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5271 
5272   MVT MTy = LT.second;
5273 
5274   int ISD;
5275   if (ValTy->isIntOrIntVectorTy()) {
5276     ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5277                                                              : ISD::SMIN;
5278   } else {
5279     assert(ValTy->isFPOrFPVectorTy() &&
5280            "Expected float point or integer vector type.");
5281     ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5282               ? ISD::FMINNUM
5283               : ISD::FMINIMUM;
5284   }
5285 
5286   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5287   // and make it as the cost.
5288 
5289   static const CostTblEntry SSE2CostTbl[] = {
5290       {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5291       {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5292       {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5293   };
5294 
5295   static const CostTblEntry SSE41CostTbl[] = {
5296       {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5297       {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5298       {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5299       {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5300       {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5301       {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5302       {ISD::SMIN, MVT::v2i8,  3}, // pminsb
5303       {ISD::SMIN, MVT::v4i8,  5}, // pminsb
5304       {ISD::SMIN, MVT::v8i8,  7}, // pminsb
5305       {ISD::SMIN, MVT::v16i8, 6},
5306       {ISD::UMIN, MVT::v2i8,  3}, // same as sse2
5307       {ISD::UMIN, MVT::v4i8,  5}, // same as sse2
5308       {ISD::UMIN, MVT::v8i8,  7}, // same as sse2
5309       {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5310   };
5311 
5312   static const CostTblEntry AVX1CostTbl[] = {
5313       {ISD::SMIN, MVT::v16i16, 6},
5314       {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5315       {ISD::SMIN, MVT::v32i8, 8},
5316       {ISD::UMIN, MVT::v32i8, 8},
5317   };
5318 
5319   static const CostTblEntry AVX512BWCostTbl[] = {
5320       {ISD::SMIN, MVT::v32i16, 8},
5321       {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5322       {ISD::SMIN, MVT::v64i8, 10},
5323       {ISD::UMIN, MVT::v64i8, 10},
5324   };
5325 
5326   // Before legalizing the type, give a chance to look up illegal narrow types
5327   // in the table.
5328   // FIXME: Is there a better way to do this?
5329   EVT VT = TLI->getValueType(DL, ValTy);
5330   if (VT.isSimple()) {
5331     MVT MTy = VT.getSimpleVT();
5332     if (ST->hasBWI())
5333       if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5334         return Entry->Cost;
5335 
5336     if (ST->hasAVX())
5337       if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5338         return Entry->Cost;
5339 
5340     if (ST->hasSSE41())
5341       if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5342         return Entry->Cost;
5343 
5344     if (ST->hasSSE2())
5345       if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5346         return Entry->Cost;
5347   }
5348 
5349   auto *ValVTy = cast<FixedVectorType>(ValTy);
5350   unsigned NumVecElts = ValVTy->getNumElements();
5351 
5352   auto *Ty = ValVTy;
5353   InstructionCost MinMaxCost = 0;
5354   if (LT.first != 1 && MTy.isVector() &&
5355       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5356     // Type needs to be split. We need LT.first - 1 operations ops.
5357     Ty = FixedVectorType::get(ValVTy->getElementType(),
5358                               MTy.getVectorNumElements());
5359     MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5360     MinMaxCost *= LT.first - 1;
5361     NumVecElts = MTy.getVectorNumElements();
5362   }
5363 
5364   if (ST->hasBWI())
5365     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5366       return MinMaxCost + Entry->Cost;
5367 
5368   if (ST->hasAVX())
5369     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5370       return MinMaxCost + Entry->Cost;
5371 
5372   if (ST->hasSSE41())
5373     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5374       return MinMaxCost + Entry->Cost;
5375 
5376   if (ST->hasSSE2())
5377     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5378       return MinMaxCost + Entry->Cost;
5379 
5380   unsigned ScalarSize = ValTy->getScalarSizeInBits();
5381 
5382   // Special case power of 2 reductions where the scalar type isn't changed
5383   // by type legalization.
5384   if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5385       ScalarSize != MTy.getScalarSizeInBits())
5386     return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5387 
5388   // Now handle reduction with the legal type, taking into account size changes
5389   // at each level.
5390   while (NumVecElts > 1) {
5391     // Determine the size of the remaining vector we need to reduce.
5392     unsigned Size = NumVecElts * ScalarSize;
5393     NumVecElts /= 2;
5394     // If we're reducing from 256/512 bits, use an extract_subvector.
5395     if (Size > 128) {
5396       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5397       MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5398                                    CostKind, NumVecElts, SubTy);
5399       Ty = SubTy;
5400     } else if (Size == 128) {
5401       // Reducing from 128 bits is a permute of v2f64/v2i64.
5402       VectorType *ShufTy;
5403       if (ValTy->isFloatingPointTy())
5404         ShufTy =
5405             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5406       else
5407         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5408       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5409                                    std::nullopt, CostKind, 0, nullptr);
5410     } else if (Size == 64) {
5411       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5412       FixedVectorType *ShufTy;
5413       if (ValTy->isFloatingPointTy())
5414         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5415       else
5416         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5417       MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5418                                    std::nullopt, CostKind, 0, nullptr);
5419     } else {
5420       // Reducing from smaller size is a shift by immediate.
5421       auto *ShiftTy = FixedVectorType::get(
5422           Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5423       MinMaxCost += getArithmeticInstrCost(
5424           Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5425           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5426           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5427     }
5428 
5429     // Add the arithmetic op for this level.
5430     MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5431   }
5432 
5433   // Add the final extract element to the cost.
5434   return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5435                                          CostKind, 0, nullptr, nullptr);
5436 }
5437 
5438 /// Calculate the cost of materializing a 64-bit value. This helper
5439 /// method might only calculate a fraction of a larger immediate. Therefore it
5440 /// is valid to return a cost of ZERO.
5441 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5442   if (Val == 0)
5443     return TTI::TCC_Free;
5444 
5445   if (isInt<32>(Val))
5446     return TTI::TCC_Basic;
5447 
5448   return 2 * TTI::TCC_Basic;
5449 }
5450 
5451 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5452                                           TTI::TargetCostKind CostKind) {
5453   assert(Ty->isIntegerTy());
5454 
5455   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5456   if (BitSize == 0)
5457     return ~0U;
5458 
5459   // Never hoist constants larger than 128bit, because this might lead to
5460   // incorrect code generation or assertions in codegen.
5461   // Fixme: Create a cost model for types larger than i128 once the codegen
5462   // issues have been fixed.
5463   if (BitSize > 128)
5464     return TTI::TCC_Free;
5465 
5466   if (Imm == 0)
5467     return TTI::TCC_Free;
5468 
5469   // Sign-extend all constants to a multiple of 64-bit.
5470   APInt ImmVal = Imm;
5471   if (BitSize % 64 != 0)
5472     ImmVal = Imm.sext(alignTo(BitSize, 64));
5473 
5474   // Split the constant into 64-bit chunks and calculate the cost for each
5475   // chunk.
5476   InstructionCost Cost = 0;
5477   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5478     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5479     int64_t Val = Tmp.getSExtValue();
5480     Cost += getIntImmCost(Val);
5481   }
5482   // We need at least one instruction to materialize the constant.
5483   return std::max<InstructionCost>(1, Cost);
5484 }
5485 
5486 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5487                                               const APInt &Imm, Type *Ty,
5488                                               TTI::TargetCostKind CostKind,
5489                                               Instruction *Inst) {
5490   assert(Ty->isIntegerTy());
5491 
5492   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5493   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5494   // here, so that constant hoisting will ignore this constant.
5495   if (BitSize == 0)
5496     return TTI::TCC_Free;
5497 
5498   unsigned ImmIdx = ~0U;
5499   switch (Opcode) {
5500   default:
5501     return TTI::TCC_Free;
5502   case Instruction::GetElementPtr:
5503     // Always hoist the base address of a GetElementPtr. This prevents the
5504     // creation of new constants for every base constant that gets constant
5505     // folded with the offset.
5506     if (Idx == 0)
5507       return 2 * TTI::TCC_Basic;
5508     return TTI::TCC_Free;
5509   case Instruction::Store:
5510     ImmIdx = 0;
5511     break;
5512   case Instruction::ICmp:
5513     // This is an imperfect hack to prevent constant hoisting of
5514     // compares that might be trying to check if a 64-bit value fits in
5515     // 32-bits. The backend can optimize these cases using a right shift by 32.
5516     // Ideally we would check the compare predicate here. There also other
5517     // similar immediates the backend can use shifts for.
5518     if (Idx == 1 && Imm.getBitWidth() == 64) {
5519       uint64_t ImmVal = Imm.getZExtValue();
5520       if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5521         return TTI::TCC_Free;
5522     }
5523     ImmIdx = 1;
5524     break;
5525   case Instruction::And:
5526     // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5527     // by using a 32-bit operation with implicit zero extension. Detect such
5528     // immediates here as the normal path expects bit 31 to be sign extended.
5529     if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5530       return TTI::TCC_Free;
5531     ImmIdx = 1;
5532     break;
5533   case Instruction::Add:
5534   case Instruction::Sub:
5535     // For add/sub, we can use the opposite instruction for INT32_MIN.
5536     if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5537       return TTI::TCC_Free;
5538     ImmIdx = 1;
5539     break;
5540   case Instruction::UDiv:
5541   case Instruction::SDiv:
5542   case Instruction::URem:
5543   case Instruction::SRem:
5544     // Division by constant is typically expanded later into a different
5545     // instruction sequence. This completely changes the constants.
5546     // Report them as "free" to stop ConstantHoist from marking them as opaque.
5547     return TTI::TCC_Free;
5548   case Instruction::Mul:
5549   case Instruction::Or:
5550   case Instruction::Xor:
5551     ImmIdx = 1;
5552     break;
5553   // Always return TCC_Free for the shift value of a shift instruction.
5554   case Instruction::Shl:
5555   case Instruction::LShr:
5556   case Instruction::AShr:
5557     if (Idx == 1)
5558       return TTI::TCC_Free;
5559     break;
5560   case Instruction::Trunc:
5561   case Instruction::ZExt:
5562   case Instruction::SExt:
5563   case Instruction::IntToPtr:
5564   case Instruction::PtrToInt:
5565   case Instruction::BitCast:
5566   case Instruction::PHI:
5567   case Instruction::Call:
5568   case Instruction::Select:
5569   case Instruction::Ret:
5570   case Instruction::Load:
5571     break;
5572   }
5573 
5574   if (Idx == ImmIdx) {
5575     uint64_t NumConstants = divideCeil(BitSize, 64);
5576     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5577     return (Cost <= NumConstants * TTI::TCC_Basic)
5578                ? static_cast<int>(TTI::TCC_Free)
5579                : Cost;
5580   }
5581 
5582   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5583 }
5584 
5585 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5586                                                 const APInt &Imm, Type *Ty,
5587                                                 TTI::TargetCostKind CostKind) {
5588   assert(Ty->isIntegerTy());
5589 
5590   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5591   // There is no cost model for constants with a bit size of 0. Return TCC_Free
5592   // here, so that constant hoisting will ignore this constant.
5593   if (BitSize == 0)
5594     return TTI::TCC_Free;
5595 
5596   switch (IID) {
5597   default:
5598     return TTI::TCC_Free;
5599   case Intrinsic::sadd_with_overflow:
5600   case Intrinsic::uadd_with_overflow:
5601   case Intrinsic::ssub_with_overflow:
5602   case Intrinsic::usub_with_overflow:
5603   case Intrinsic::smul_with_overflow:
5604   case Intrinsic::umul_with_overflow:
5605     if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5606       return TTI::TCC_Free;
5607     break;
5608   case Intrinsic::experimental_stackmap:
5609     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5610       return TTI::TCC_Free;
5611     break;
5612   case Intrinsic::experimental_patchpoint_void:
5613   case Intrinsic::experimental_patchpoint_i64:
5614     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5615       return TTI::TCC_Free;
5616     break;
5617   }
5618   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5619 }
5620 
5621 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5622                                            TTI::TargetCostKind CostKind,
5623                                            const Instruction *I) {
5624   if (CostKind != TTI::TCK_RecipThroughput)
5625     return Opcode == Instruction::PHI ? 0 : 1;
5626   // Branches are assumed to be predicted.
5627   return 0;
5628 }
5629 
5630 int X86TTIImpl::getGatherOverhead() const {
5631   // Some CPUs have more overhead for gather. The specified overhead is relative
5632   // to the Load operation. "2" is the number provided by Intel architects. This
5633   // parameter is used for cost estimation of Gather Op and comparison with
5634   // other alternatives.
5635   // TODO: Remove the explicit hasAVX512()?, That would mean we would only
5636   // enable gather with a -march.
5637   if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5638     return 2;
5639 
5640   return 1024;
5641 }
5642 
5643 int X86TTIImpl::getScatterOverhead() const {
5644   if (ST->hasAVX512())
5645     return 2;
5646 
5647   return 1024;
5648 }
5649 
5650 // Return an average cost of Gather / Scatter instruction, maybe improved later.
5651 // FIXME: Add TargetCostKind support.
5652 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
5653                                             const Value *Ptr, Align Alignment,
5654                                             unsigned AddressSpace) {
5655 
5656   assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5657   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5658 
5659   // Try to reduce index size from 64 bit (default for GEP)
5660   // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5661   // operation will use 16 x 64 indices which do not fit in a zmm and needs
5662   // to split. Also check that the base pointer is the same for all lanes,
5663   // and that there's at most one variable index.
5664   auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5665     unsigned IndexSize = DL.getPointerSizeInBits();
5666     const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5667     if (IndexSize < 64 || !GEP)
5668       return IndexSize;
5669 
5670     unsigned NumOfVarIndices = 0;
5671     const Value *Ptrs = GEP->getPointerOperand();
5672     if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5673       return IndexSize;
5674     for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5675       if (isa<Constant>(GEP->getOperand(I)))
5676         continue;
5677       Type *IndxTy = GEP->getOperand(I)->getType();
5678       if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5679         IndxTy = IndexVTy->getElementType();
5680       if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5681            !isa<SExtInst>(GEP->getOperand(I))) ||
5682           ++NumOfVarIndices > 1)
5683         return IndexSize; // 64
5684     }
5685     return (unsigned)32;
5686   };
5687 
5688   // Trying to reduce IndexSize to 32 bits for vector 16.
5689   // By default the IndexSize is equal to pointer size.
5690   unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5691                            ? getIndexSizeInBits(Ptr, DL)
5692                            : DL.getPointerSizeInBits();
5693 
5694   auto *IndexVTy = FixedVectorType::get(
5695       IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5696   std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5697   std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5698   InstructionCost::CostType SplitFactor =
5699       *std::max(IdxsLT.first, SrcLT.first).getValue();
5700   if (SplitFactor > 1) {
5701     // Handle splitting of vector of pointers
5702     auto *SplitSrcTy =
5703         FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5704     return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
5705                                          AddressSpace);
5706   }
5707 
5708   // The gather / scatter cost is given by Intel architects. It is a rough
5709   // number since we are looking at one instruction in a time.
5710   const int GSOverhead = (Opcode == Instruction::Load)
5711                              ? getGatherOverhead()
5712                              : getScatterOverhead();
5713   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5714                                            MaybeAlign(Alignment), AddressSpace,
5715                                            TTI::TCK_RecipThroughput);
5716 }
5717 
5718 /// Return the cost of full scalarization of gather / scatter operation.
5719 ///
5720 /// Opcode - Load or Store instruction.
5721 /// SrcVTy - The type of the data vector that should be gathered or scattered.
5722 /// VariableMask - The mask is non-constant at compile time.
5723 /// Alignment - Alignment for one element.
5724 /// AddressSpace - pointer[s] address space.
5725 ///
5726 /// FIXME: Add TargetCostKind support.
5727 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
5728                                             bool VariableMask, Align Alignment,
5729                                             unsigned AddressSpace) {
5730   Type *ScalarTy = SrcVTy->getScalarType();
5731   unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5732   APInt DemandedElts = APInt::getAllOnes(VF);
5733   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5734 
5735   InstructionCost MaskUnpackCost = 0;
5736   if (VariableMask) {
5737     auto *MaskTy =
5738         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5739     MaskUnpackCost = getScalarizationOverhead(
5740         MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5741     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5742         Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5743         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5744     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5745     MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5746   }
5747 
5748   InstructionCost AddressUnpackCost = getScalarizationOverhead(
5749       FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
5750       /*Insert=*/false, /*Extract=*/true, CostKind);
5751 
5752   // The cost of the scalar loads/stores.
5753   InstructionCost MemoryOpCost =
5754       VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5755                            AddressSpace, CostKind);
5756 
5757   // The cost of forming the vector from loaded scalars/
5758   // scalarizing the vector to perform scalar stores.
5759   InstructionCost InsertExtractCost = getScalarizationOverhead(
5760       cast<FixedVectorType>(SrcVTy), DemandedElts,
5761       /*Insert=*/Opcode == Instruction::Load,
5762       /*Extract=*/Opcode == Instruction::Store, CostKind);
5763 
5764   return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5765 }
5766 
5767 /// Calculate the cost of Gather / Scatter operation
5768 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5769     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5770     Align Alignment, TTI::TargetCostKind CostKind,
5771     const Instruction *I = nullptr) {
5772   if (CostKind != TTI::TCK_RecipThroughput) {
5773     if ((Opcode == Instruction::Load &&
5774          isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5775          !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5776                                      Align(Alignment))) ||
5777         (Opcode == Instruction::Store &&
5778          isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5779          !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5780                                       Align(Alignment))))
5781       return 1;
5782     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5783                                          Alignment, CostKind, I);
5784   }
5785 
5786   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5787   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5788   if (!PtrTy && Ptr->getType()->isVectorTy())
5789     PtrTy = dyn_cast<PointerType>(
5790         cast<VectorType>(Ptr->getType())->getElementType());
5791   assert(PtrTy && "Unexpected type for Ptr argument");
5792   unsigned AddressSpace = PtrTy->getAddressSpace();
5793 
5794   if ((Opcode == Instruction::Load &&
5795        (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5796         forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5797                                    Align(Alignment)))) ||
5798       (Opcode == Instruction::Store &&
5799        (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5800         forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5801                                     Align(Alignment)))))
5802     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
5803                            AddressSpace);
5804 
5805   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
5806 }
5807 
5808 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5809                                const TargetTransformInfo::LSRCost &C2) {
5810     // X86 specific here are "instruction number 1st priority".
5811     return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5812                     C1.NumIVMuls, C1.NumBaseAdds,
5813                     C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5814            std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5815                     C2.NumIVMuls, C2.NumBaseAdds,
5816                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5817 }
5818 
5819 bool X86TTIImpl::canMacroFuseCmp() {
5820   return ST->hasMacroFusion() || ST->hasBranchFusion();
5821 }
5822 
5823 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5824   if (!ST->hasAVX())
5825     return false;
5826 
5827   // The backend can't handle a single element vector.
5828   if (isa<VectorType>(DataTy) &&
5829       cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5830     return false;
5831   Type *ScalarTy = DataTy->getScalarType();
5832 
5833   if (ScalarTy->isPointerTy())
5834     return true;
5835 
5836   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5837     return true;
5838 
5839   if (ScalarTy->isHalfTy() && ST->hasBWI())
5840     return true;
5841 
5842   if (ScalarTy->isBFloatTy() && ST->hasBF16())
5843     return true;
5844 
5845   if (!ScalarTy->isIntegerTy())
5846     return false;
5847 
5848   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5849   return IntWidth == 32 || IntWidth == 64 ||
5850          ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5851 }
5852 
5853 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5854   return isLegalMaskedLoad(DataType, Alignment);
5855 }
5856 
5857 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5858   unsigned DataSize = DL.getTypeStoreSize(DataType);
5859   // The only supported nontemporal loads are for aligned vectors of 16 or 32
5860   // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
5861   // (the equivalent stores only require AVX).
5862   if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5863     return DataSize == 16 ?  ST->hasSSE1() : ST->hasAVX2();
5864 
5865   return false;
5866 }
5867 
5868 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5869   unsigned DataSize = DL.getTypeStoreSize(DataType);
5870 
5871   // SSE4A supports nontemporal stores of float and double at arbitrary
5872   // alignment.
5873   if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5874     return true;
5875 
5876   // Besides the SSE4A subtarget exception above, only aligned stores are
5877   // available nontemporaly on any other subtarget.  And only stores with a size
5878   // of 4..32 bytes (powers of 2, only) are permitted.
5879   if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5880       !isPowerOf2_32(DataSize))
5881     return false;
5882 
5883   // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5884   // loads require AVX2).
5885   if (DataSize == 32)
5886     return ST->hasAVX();
5887   if (DataSize == 16)
5888     return ST->hasSSE1();
5889   return true;
5890 }
5891 
5892 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5893                                       ElementCount NumElements) const {
5894   // movddup
5895   return ST->hasSSE3() && !NumElements.isScalable() &&
5896          NumElements.getFixedValue() == 2 &&
5897          ElementTy == Type::getDoubleTy(ElementTy->getContext());
5898 }
5899 
5900 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
5901   if (!isa<VectorType>(DataTy))
5902     return false;
5903 
5904   if (!ST->hasAVX512())
5905     return false;
5906 
5907   // The backend can't handle a single element vector.
5908   if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5909     return false;
5910 
5911   Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5912 
5913   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5914     return true;
5915 
5916   if (!ScalarTy->isIntegerTy())
5917     return false;
5918 
5919   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5920   return IntWidth == 32 || IntWidth == 64 ||
5921          ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5922 }
5923 
5924 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
5925   return isLegalMaskedExpandLoad(DataTy);
5926 }
5927 
5928 bool X86TTIImpl::supportsGather() const {
5929   // Some CPUs have better gather performance than others.
5930   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
5931   // enable gather with a -march.
5932   return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5933 }
5934 
5935 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
5936   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
5937   // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
5938   // it to 8 elements, but zeroing upper bits of the mask vector will add more
5939   // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
5940   // Check, maybe the gather/scatter instruction is better in the VariableMask
5941   // case.
5942   unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5943   return NumElts == 1 ||
5944          (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5945 }
5946 
5947 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
5948   Type *ScalarTy = DataTy->getScalarType();
5949   if (ScalarTy->isPointerTy())
5950     return true;
5951 
5952   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5953     return true;
5954 
5955   if (!ScalarTy->isIntegerTy())
5956     return false;
5957 
5958   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5959   return IntWidth == 32 || IntWidth == 64;
5960 }
5961 
5962 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
5963   if (!supportsGather() || !ST->preferGather())
5964     return false;
5965   return isLegalMaskedGatherScatter(DataTy, Alignment);
5966 }
5967 
5968 bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
5969                                  unsigned Opcode1,
5970                                  const SmallBitVector &OpcodeMask) const {
5971   // ADDSUBPS  4xf32 SSE3
5972   // VADDSUBPS 4xf32 AVX
5973   // VADDSUBPS 8xf32 AVX2
5974   // ADDSUBPD  2xf64 SSE3
5975   // VADDSUBPD 2xf64 AVX
5976   // VADDSUBPD 4xf64 AVX2
5977 
5978   unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
5979   assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
5980   if (!isPowerOf2_32(NumElements))
5981     return false;
5982   // Check the opcode pattern. We apply the mask on the opcode arguments and
5983   // then check if it is what we expect.
5984   for (int Lane : seq<int>(0, NumElements)) {
5985     unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
5986     // We expect FSub for even lanes and FAdd for odd lanes.
5987     if (Lane % 2 == 0 && Opc != Instruction::FSub)
5988       return false;
5989     if (Lane % 2 == 1 && Opc != Instruction::FAdd)
5990       return false;
5991   }
5992   // Now check that the pattern is supported by the target ISA.
5993   Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
5994   if (ElemTy->isFloatTy())
5995     return ST->hasSSE3() && NumElements % 4 == 0;
5996   if (ElemTy->isDoubleTy())
5997     return ST->hasSSE3() && NumElements % 2 == 0;
5998   return false;
5999 }
6000 
6001 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6002   // AVX2 doesn't support scatter
6003   if (!ST->hasAVX512() || !ST->preferScatter())
6004     return false;
6005   return isLegalMaskedGatherScatter(DataType, Alignment);
6006 }
6007 
6008 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6009   EVT VT = TLI->getValueType(DL, DataType);
6010   return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6011 }
6012 
6013 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6014   // FDIV is always expensive, even if it has a very low uop count.
6015   // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6016   if (I->getOpcode() == Instruction::FDiv)
6017     return true;
6018 
6019   return BaseT::isExpensiveToSpeculativelyExecute(I);
6020 }
6021 
6022 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6023   return false;
6024 }
6025 
6026 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6027                                      const Function *Callee) const {
6028   const TargetMachine &TM = getTLI()->getTargetMachine();
6029 
6030   // Work this as a subsetting of subtarget features.
6031   const FeatureBitset &CallerBits =
6032       TM.getSubtargetImpl(*Caller)->getFeatureBits();
6033   const FeatureBitset &CalleeBits =
6034       TM.getSubtargetImpl(*Callee)->getFeatureBits();
6035 
6036   // Check whether features are the same (apart from the ignore list).
6037   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6038   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6039   if (RealCallerBits == RealCalleeBits)
6040     return true;
6041 
6042   // If the features are a subset, we need to additionally check for calls
6043   // that may become ABI-incompatible as a result of inlining.
6044   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6045     return false;
6046 
6047   for (const Instruction &I : instructions(Callee)) {
6048     if (const auto *CB = dyn_cast<CallBase>(&I)) {
6049       SmallVector<Type *, 8> Types;
6050       for (Value *Arg : CB->args())
6051         Types.push_back(Arg->getType());
6052       if (!CB->getType()->isVoidTy())
6053         Types.push_back(CB->getType());
6054 
6055       // Simple types are always ABI compatible.
6056       auto IsSimpleTy = [](Type *Ty) {
6057         return !Ty->isVectorTy() && !Ty->isAggregateType();
6058       };
6059       if (all_of(Types, IsSimpleTy))
6060         continue;
6061 
6062       if (Function *NestedCallee = CB->getCalledFunction()) {
6063         // Assume that intrinsics are always ABI compatible.
6064         if (NestedCallee->isIntrinsic())
6065           continue;
6066 
6067         // Do a precise compatibility check.
6068         if (!areTypesABICompatible(Caller, NestedCallee, Types))
6069           return false;
6070       } else {
6071         // We don't know the target features of the callee,
6072         // assume it is incompatible.
6073         return false;
6074       }
6075     }
6076   }
6077   return true;
6078 }
6079 
6080 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6081                                        const Function *Callee,
6082                                        const ArrayRef<Type *> &Types) const {
6083   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6084     return false;
6085 
6086   // If we get here, we know the target features match. If one function
6087   // considers 512-bit vectors legal and the other does not, consider them
6088   // incompatible.
6089   const TargetMachine &TM = getTLI()->getTargetMachine();
6090 
6091   if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6092       TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6093     return true;
6094 
6095   // Consider the arguments compatible if they aren't vectors or aggregates.
6096   // FIXME: Look at the size of vectors.
6097   // FIXME: Look at the element types of aggregates to see if there are vectors.
6098   return llvm::none_of(Types,
6099       [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6100 }
6101 
6102 X86TTIImpl::TTI::MemCmpExpansionOptions
6103 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6104   TTI::MemCmpExpansionOptions Options;
6105   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6106   Options.NumLoadsPerBlock = 2;
6107   // All GPR and vector loads can be unaligned.
6108   Options.AllowOverlappingLoads = true;
6109   if (IsZeroCmp) {
6110     // Only enable vector loads for equality comparison. Right now the vector
6111     // version is not as fast for three way compare (see #33329).
6112     const unsigned PreferredWidth = ST->getPreferVectorWidth();
6113     if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
6114     if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6115     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6116   }
6117   if (ST->is64Bit()) {
6118     Options.LoadSizes.push_back(8);
6119   }
6120   Options.LoadSizes.push_back(4);
6121   Options.LoadSizes.push_back(2);
6122   Options.LoadSizes.push_back(1);
6123   return Options;
6124 }
6125 
6126 bool X86TTIImpl::prefersVectorizedAddressing() const {
6127   return supportsGather();
6128 }
6129 
6130 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6131   return false;
6132 }
6133 
6134 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6135   // TODO: We expect this to be beneficial regardless of arch,
6136   // but there are currently some unexplained performance artifacts on Atom.
6137   // As a temporary solution, disable on Atom.
6138   return !(ST->isAtom());
6139 }
6140 
6141 // Get estimation for interleaved load/store operations and strided load.
6142 // \p Indices contains indices for strided load.
6143 // \p Factor - the factor of interleaving.
6144 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
6145 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6146     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6147     ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6148     TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6149   // VecTy for interleave memop is <VF*Factor x Elt>.
6150   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6151   // VecTy = <12 x i32>.
6152 
6153   // Calculate the number of memory operations (NumOfMemOps), required
6154   // for load/store the VecTy.
6155   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6156   unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6157   unsigned LegalVTSize = LegalVT.getStoreSize();
6158   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6159 
6160   // Get the cost of one memory operation.
6161   auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6162                                              LegalVT.getVectorNumElements());
6163   InstructionCost MemOpCost;
6164   bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6165   if (UseMaskedMemOp)
6166     MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6167                                       AddressSpace, CostKind);
6168   else
6169     MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6170                                 AddressSpace, CostKind);
6171 
6172   unsigned VF = VecTy->getNumElements() / Factor;
6173   MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6174 
6175   InstructionCost MaskCost;
6176   if (UseMaskedMemOp) {
6177     APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6178     for (unsigned Index : Indices) {
6179       assert(Index < Factor && "Invalid index for interleaved memory op");
6180       for (unsigned Elm = 0; Elm < VF; Elm++)
6181         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6182     }
6183 
6184     Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6185 
6186     MaskCost = getReplicationShuffleCost(
6187         I1Type, Factor, VF,
6188         UseMaskForGaps ? DemandedLoadStoreElts
6189                        : APInt::getAllOnes(VecTy->getNumElements()),
6190         CostKind);
6191 
6192     // The Gaps mask is invariant and created outside the loop, therefore the
6193     // cost of creating it is not accounted for here. However if we have both
6194     // a MaskForGaps and some other mask that guards the execution of the
6195     // memory access, we need to account for the cost of And-ing the two masks
6196     // inside the loop.
6197     if (UseMaskForGaps) {
6198       auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6199       MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6200     }
6201   }
6202 
6203   if (Opcode == Instruction::Load) {
6204     // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6205     // contain the cost of the optimized shuffle sequence that the
6206     // X86InterleavedAccess pass will generate.
6207     // The cost of loads and stores are computed separately from the table.
6208 
6209     // X86InterleavedAccess support only the following interleaved-access group.
6210     static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6211         {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6212         {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6213         {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6214     };
6215 
6216     if (const auto *Entry =
6217             CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6218       return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6219     //If an entry does not exist, fallback to the default implementation.
6220 
6221     // Kind of shuffle depends on number of loaded values.
6222     // If we load the entire data in one register, we can use a 1-src shuffle.
6223     // Otherwise, we'll merge 2 sources in each operation.
6224     TTI::ShuffleKind ShuffleKind =
6225         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6226 
6227     InstructionCost ShuffleCost = getShuffleCost(
6228         ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6229 
6230     unsigned NumOfLoadsInInterleaveGrp =
6231         Indices.size() ? Indices.size() : Factor;
6232     auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6233                                           VecTy->getNumElements() / Factor);
6234     InstructionCost NumOfResults =
6235         getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6236 
6237     // About a half of the loads may be folded in shuffles when we have only
6238     // one result. If we have more than one result, or the loads are masked,
6239     // we do not fold loads at all.
6240     unsigned NumOfUnfoldedLoads =
6241         UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6242 
6243     // Get a number of shuffle operations per result.
6244     unsigned NumOfShufflesPerResult =
6245         std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6246 
6247     // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6248     // When we have more than one destination, we need additional instructions
6249     // to keep sources.
6250     InstructionCost NumOfMoves = 0;
6251     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6252       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6253 
6254     InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6255                            MaskCost + NumOfUnfoldedLoads * MemOpCost +
6256                            NumOfMoves;
6257 
6258     return Cost;
6259   }
6260 
6261   // Store.
6262   assert(Opcode == Instruction::Store &&
6263          "Expected Store Instruction at this  point");
6264   // X86InterleavedAccess support only the following interleaved-access group.
6265   static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6266       {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6267       {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6268       {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6269 
6270       {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
6271       {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
6272       {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6273       {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
6274   };
6275 
6276   if (const auto *Entry =
6277           CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6278     return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6279   //If an entry does not exist, fallback to the default implementation.
6280 
6281   // There is no strided stores meanwhile. And store can't be folded in
6282   // shuffle.
6283   unsigned NumOfSources = Factor; // The number of values to be merged.
6284   InstructionCost ShuffleCost = getShuffleCost(
6285       TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6286   unsigned NumOfShufflesPerStore = NumOfSources - 1;
6287 
6288   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6289   // We need additional instructions to keep sources.
6290   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6291   InstructionCost Cost =
6292       MaskCost +
6293       NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6294       NumOfMoves;
6295   return Cost;
6296 }
6297 
6298 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6299     unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6300     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6301     bool UseMaskForCond, bool UseMaskForGaps) {
6302   auto *VecTy = cast<FixedVectorType>(BaseTy);
6303 
6304   auto isSupportedOnAVX512 = [&](Type *VecTy) {
6305     Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6306     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6307         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6308       return true;
6309     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6310       return ST->hasBWI();
6311     if (EltTy->isBFloatTy())
6312       return ST->hasBF16();
6313     return false;
6314   };
6315   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6316     return getInterleavedMemoryOpCostAVX512(
6317         Opcode, VecTy, Factor, Indices, Alignment,
6318         AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6319 
6320   if (UseMaskForCond || UseMaskForGaps)
6321     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6322                                              Alignment, AddressSpace, CostKind,
6323                                              UseMaskForCond, UseMaskForGaps);
6324 
6325   // Get estimation for interleaved load/store operations for SSE-AVX2.
6326   // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6327   // computing the cost using a generic formula as a function of generic
6328   // shuffles. We therefore use a lookup table instead, filled according to
6329   // the instruction sequences that codegen currently generates.
6330 
6331   // VecTy for interleave memop is <VF*Factor x Elt>.
6332   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6333   // VecTy = <12 x i32>.
6334   MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6335 
6336   // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6337   // the VF=2, while v2i128 is an unsupported MVT vector type
6338   // (see MachineValueType.h::getVectorVT()).
6339   if (!LegalVT.isVector())
6340     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6341                                              Alignment, AddressSpace, CostKind);
6342 
6343   unsigned VF = VecTy->getNumElements() / Factor;
6344   Type *ScalarTy = VecTy->getElementType();
6345   // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6346   if (!ScalarTy->isIntegerTy())
6347     ScalarTy =
6348         Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6349 
6350   // Get the cost of all the memory operations.
6351   // FIXME: discount dead loads.
6352   InstructionCost MemOpCosts = getMemoryOpCost(
6353       Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6354 
6355   auto *VT = FixedVectorType::get(ScalarTy, VF);
6356   EVT ETy = TLI->getValueType(DL, VT);
6357   if (!ETy.isSimple())
6358     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6359                                              Alignment, AddressSpace, CostKind);
6360 
6361   // TODO: Complete for other data-types and strides.
6362   // Each combination of Stride, element bit width and VF results in a different
6363   // sequence; The cost tables are therefore accessed with:
6364   // Factor (stride) and VectorType=VFxiN.
6365   // The Cost accounts only for the shuffle sequence;
6366   // The cost of the loads/stores is accounted for separately.
6367   //
6368   static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6369       {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
6370       {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
6371       {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
6372       {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6373       {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6374 
6375       {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
6376       {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
6377       {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6378 
6379       {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
6380       {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
6381       {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6382 
6383       {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
6384       {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
6385       {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6386       {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6387 
6388       {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
6389       {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
6390       {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
6391       {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6392       {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6393 
6394       {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
6395       {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
6396       {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
6397       {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6398       {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6399 
6400       {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
6401       {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
6402       {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
6403       {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6404       {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6405 
6406       {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
6407       {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
6408       {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
6409       {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6410 
6411       {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
6412       {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
6413       {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
6414       {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6415       {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6416 
6417       {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
6418       {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
6419       {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
6420       {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
6421       {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6422 
6423       {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
6424       {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
6425       {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
6426       {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6427       {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6428 
6429       {4, MVT::v2i64, 6},  // (load 8i64 and) deinterleave into 4 x 2i64
6430       {4, MVT::v4i64, 8},  // (load 16i64 and) deinterleave into 4 x 4i64
6431       {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6432       {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6433 
6434       {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
6435       {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
6436       {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
6437       {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6438       {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6439 
6440       {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
6441       {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
6442       {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
6443       {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6444       {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6445 
6446       {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
6447       {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
6448       {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
6449       {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6450 
6451       {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
6452       {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6453       {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6454 
6455       {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6456   };
6457 
6458   static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6459       {2, MVT::v4i16, 2},   // (load 8i16 and) deinterleave into 2 x 4i16
6460   };
6461 
6462   static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6463       {2, MVT::v2i16, 2},   // (load 4i16 and) deinterleave into 2 x 2i16
6464       {2, MVT::v4i16, 7},   // (load 8i16 and) deinterleave into 2 x 4i16
6465 
6466       {2, MVT::v2i32, 2},   // (load 4i32 and) deinterleave into 2 x 2i32
6467       {2, MVT::v4i32, 2},   // (load 8i32 and) deinterleave into 2 x 4i32
6468 
6469       {2, MVT::v2i64, 2},   // (load 4i64 and) deinterleave into 2 x 2i64
6470   };
6471 
6472   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6473       {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6474       {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6475 
6476       {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
6477       {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6478       {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6479 
6480       {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
6481       {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
6482       {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
6483       {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6484 
6485       {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
6486       {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
6487       {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
6488       {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6489       {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6490 
6491       {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
6492       {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
6493       {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
6494       {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6495       {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6496 
6497       {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
6498       {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
6499       {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
6500       {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6501       {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6502 
6503       {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
6504       {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
6505       {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
6506       {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6507       {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6508 
6509       {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
6510       {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
6511       {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
6512       {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6513 
6514       {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
6515       {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
6516       {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
6517       {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
6518       {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6519 
6520       {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
6521       {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
6522       {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
6523       {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6524       {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6525 
6526       {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
6527       {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
6528       {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
6529       {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6530       {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6531 
6532       {4, MVT::v2i64, 6},  // interleave 4 x 2i64 into 8i64 (and store)
6533       {4, MVT::v4i64, 8},  // interleave 4 x 4i64 into 16i64 (and store)
6534       {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6535       {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6536 
6537       {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
6538       {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
6539       {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
6540       {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6541       {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6542 
6543       {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
6544       {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
6545       {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
6546       {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6547       {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6548 
6549       {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
6550       {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
6551       {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
6552       {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6553 
6554       {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
6555       {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6556       {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6557   };
6558 
6559   static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6560       {2, MVT::v2i8, 1},   // interleave 2 x 2i8 into 4i8 (and store)
6561       {2, MVT::v4i8, 1},   // interleave 2 x 4i8 into 8i8 (and store)
6562       {2, MVT::v8i8, 1},   // interleave 2 x 8i8 into 16i8 (and store)
6563 
6564       {2, MVT::v2i16, 1},  // interleave 2 x 2i16 into 4i16 (and store)
6565       {2, MVT::v4i16, 1},  // interleave 2 x 4i16 into 8i16 (and store)
6566 
6567       {2, MVT::v2i32, 1},  // interleave 2 x 2i32 into 4i32 (and store)
6568   };
6569 
6570   if (Opcode == Instruction::Load) {
6571     auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6572                               MemOpCosts](const CostTblEntry *Entry) {
6573       // NOTE: this is just an approximation!
6574       //       It can over/under -estimate the cost!
6575       return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6576     };
6577 
6578     if (ST->hasAVX2())
6579       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6580                                               ETy.getSimpleVT()))
6581         return GetDiscountedCost(Entry);
6582 
6583     if (ST->hasSSSE3())
6584       if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6585                                               ETy.getSimpleVT()))
6586         return GetDiscountedCost(Entry);
6587 
6588     if (ST->hasSSE2())
6589       if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6590                                               ETy.getSimpleVT()))
6591         return GetDiscountedCost(Entry);
6592   } else {
6593     assert(Opcode == Instruction::Store &&
6594            "Expected Store Instruction at this point");
6595     assert((!Indices.size() || Indices.size() == Factor) &&
6596            "Interleaved store only supports fully-interleaved groups.");
6597     if (ST->hasAVX2())
6598       if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6599                                               ETy.getSimpleVT()))
6600         return MemOpCosts + Entry->Cost;
6601 
6602     if (ST->hasSSE2())
6603       if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6604                                               ETy.getSimpleVT()))
6605         return MemOpCosts + Entry->Cost;
6606   }
6607 
6608   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6609                                            Alignment, AddressSpace, CostKind,
6610                                            UseMaskForCond, UseMaskForGaps);
6611 }
6612 
6613 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6614                                                  int64_t BaseOffset,
6615                                                  bool HasBaseReg, int64_t Scale,
6616                                                  unsigned AddrSpace) const {
6617   // Scaling factors are not free at all.
6618   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6619   // will take 2 allocations in the out of order engine instead of 1
6620   // for plain addressing mode, i.e. inst (reg1).
6621   // E.g.,
6622   // vaddps (%rsi,%rdx), %ymm0, %ymm1
6623   // Requires two allocations (one for the load, one for the computation)
6624   // whereas:
6625   // vaddps (%rsi), %ymm0, %ymm1
6626   // Requires just 1 allocation, i.e., freeing allocations for other operations
6627   // and having less micro operations to execute.
6628   //
6629   // For some X86 architectures, this is even worse because for instance for
6630   // stores, the complex addressing mode forces the instruction to use the
6631   // "load" ports instead of the dedicated "store" port.
6632   // E.g., on Haswell:
6633   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6634   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6635   TargetLoweringBase::AddrMode AM;
6636   AM.BaseGV = BaseGV;
6637   AM.BaseOffs = BaseOffset;
6638   AM.HasBaseReg = HasBaseReg;
6639   AM.Scale = Scale;
6640   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6641     // Scale represents reg2 * scale, thus account for 1
6642     // as soon as we use a second register.
6643     return AM.Scale != 0;
6644   return -1;
6645 }
6646