xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp (revision c9539b89010900499a200cdd6c0265ea5d950875)
1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "x86tti"
25 
26 /// Return a constant boolean vector that has true elements in all positions
27 /// where the input constant data vector has an element with the sign bit set.
28 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
29   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
30   V = ConstantExpr::getBitCast(V, IntTy);
31   V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
32                             V);
33   return V;
34 }
35 
36 /// Convert the x86 XMM integer vector mask to a vector of bools based on
37 /// each element's most significant bit (the sign bit).
38 static Value *getBoolVecFromMask(Value *Mask) {
39   // Fold Constant Mask.
40   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
41     return getNegativeIsTrueBoolVec(ConstantMask);
42 
43   // Mask was extended from a boolean vector.
44   Value *ExtMask;
45   if (PatternMatch::match(
46           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
47       ExtMask->getType()->isIntOrIntVectorTy(1))
48     return ExtMask;
49 
50   return nullptr;
51 }
52 
53 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
54 // XMM register mask efficiently, we could transform all x86 masked intrinsics
55 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
56 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
57   Value *Ptr = II.getOperand(0);
58   Value *Mask = II.getOperand(1);
59   Constant *ZeroVec = Constant::getNullValue(II.getType());
60 
61   // Zero Mask - masked load instruction creates a zero vector.
62   if (isa<ConstantAggregateZero>(Mask))
63     return IC.replaceInstUsesWith(II, ZeroVec);
64 
65   // The mask is constant or extended from a bool vector. Convert this x86
66   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
67   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
68     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
69     // the LLVM intrinsic definition for the pointer argument.
70     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
71     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
72     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
73 
74     // The pass-through vector for an x86 masked load is a zero vector.
75     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
76         II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
77     return IC.replaceInstUsesWith(II, NewMaskedLoad);
78   }
79 
80   return nullptr;
81 }
82 
83 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
84 // XMM register mask efficiently, we could transform all x86 masked intrinsics
85 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
86 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
87   Value *Ptr = II.getOperand(0);
88   Value *Mask = II.getOperand(1);
89   Value *Vec = II.getOperand(2);
90 
91   // Zero Mask - this masked store instruction does nothing.
92   if (isa<ConstantAggregateZero>(Mask)) {
93     IC.eraseInstFromFunction(II);
94     return true;
95   }
96 
97   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
98   // anything else at this level.
99   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
100     return false;
101 
102   // The mask is constant or extended from a bool vector. Convert this x86
103   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
104   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
105     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
106     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
107     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
108 
109     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
110 
111     // 'Replace uses' doesn't work for stores. Erase the original masked store.
112     IC.eraseInstFromFunction(II);
113     return true;
114   }
115 
116   return false;
117 }
118 
119 static Value *simplifyX86immShift(const IntrinsicInst &II,
120                                   InstCombiner::BuilderTy &Builder) {
121   bool LogicalShift = false;
122   bool ShiftLeft = false;
123   bool IsImm = false;
124 
125   switch (II.getIntrinsicID()) {
126   default:
127     llvm_unreachable("Unexpected intrinsic!");
128   case Intrinsic::x86_sse2_psrai_d:
129   case Intrinsic::x86_sse2_psrai_w:
130   case Intrinsic::x86_avx2_psrai_d:
131   case Intrinsic::x86_avx2_psrai_w:
132   case Intrinsic::x86_avx512_psrai_q_128:
133   case Intrinsic::x86_avx512_psrai_q_256:
134   case Intrinsic::x86_avx512_psrai_d_512:
135   case Intrinsic::x86_avx512_psrai_q_512:
136   case Intrinsic::x86_avx512_psrai_w_512:
137     IsImm = true;
138     LLVM_FALLTHROUGH;
139   case Intrinsic::x86_sse2_psra_d:
140   case Intrinsic::x86_sse2_psra_w:
141   case Intrinsic::x86_avx2_psra_d:
142   case Intrinsic::x86_avx2_psra_w:
143   case Intrinsic::x86_avx512_psra_q_128:
144   case Intrinsic::x86_avx512_psra_q_256:
145   case Intrinsic::x86_avx512_psra_d_512:
146   case Intrinsic::x86_avx512_psra_q_512:
147   case Intrinsic::x86_avx512_psra_w_512:
148     LogicalShift = false;
149     ShiftLeft = false;
150     break;
151   case Intrinsic::x86_sse2_psrli_d:
152   case Intrinsic::x86_sse2_psrli_q:
153   case Intrinsic::x86_sse2_psrli_w:
154   case Intrinsic::x86_avx2_psrli_d:
155   case Intrinsic::x86_avx2_psrli_q:
156   case Intrinsic::x86_avx2_psrli_w:
157   case Intrinsic::x86_avx512_psrli_d_512:
158   case Intrinsic::x86_avx512_psrli_q_512:
159   case Intrinsic::x86_avx512_psrli_w_512:
160     IsImm = true;
161     LLVM_FALLTHROUGH;
162   case Intrinsic::x86_sse2_psrl_d:
163   case Intrinsic::x86_sse2_psrl_q:
164   case Intrinsic::x86_sse2_psrl_w:
165   case Intrinsic::x86_avx2_psrl_d:
166   case Intrinsic::x86_avx2_psrl_q:
167   case Intrinsic::x86_avx2_psrl_w:
168   case Intrinsic::x86_avx512_psrl_d_512:
169   case Intrinsic::x86_avx512_psrl_q_512:
170   case Intrinsic::x86_avx512_psrl_w_512:
171     LogicalShift = true;
172     ShiftLeft = false;
173     break;
174   case Intrinsic::x86_sse2_pslli_d:
175   case Intrinsic::x86_sse2_pslli_q:
176   case Intrinsic::x86_sse2_pslli_w:
177   case Intrinsic::x86_avx2_pslli_d:
178   case Intrinsic::x86_avx2_pslli_q:
179   case Intrinsic::x86_avx2_pslli_w:
180   case Intrinsic::x86_avx512_pslli_d_512:
181   case Intrinsic::x86_avx512_pslli_q_512:
182   case Intrinsic::x86_avx512_pslli_w_512:
183     IsImm = true;
184     LLVM_FALLTHROUGH;
185   case Intrinsic::x86_sse2_psll_d:
186   case Intrinsic::x86_sse2_psll_q:
187   case Intrinsic::x86_sse2_psll_w:
188   case Intrinsic::x86_avx2_psll_d:
189   case Intrinsic::x86_avx2_psll_q:
190   case Intrinsic::x86_avx2_psll_w:
191   case Intrinsic::x86_avx512_psll_d_512:
192   case Intrinsic::x86_avx512_psll_q_512:
193   case Intrinsic::x86_avx512_psll_w_512:
194     LogicalShift = true;
195     ShiftLeft = true;
196     break;
197   }
198   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
199 
200   Value *Vec = II.getArgOperand(0);
201   Value *Amt = II.getArgOperand(1);
202   auto *VT = cast<FixedVectorType>(Vec->getType());
203   Type *SVT = VT->getElementType();
204   Type *AmtVT = Amt->getType();
205   unsigned VWidth = VT->getNumElements();
206   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
207 
208   // If the shift amount is guaranteed to be in-range we can replace it with a
209   // generic shift. If its guaranteed to be out of range, logical shifts combine
210   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
211   if (IsImm) {
212     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
213     KnownBits KnownAmtBits =
214         llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
215     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
216       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
217       Amt = Builder.CreateVectorSplat(VWidth, Amt);
218       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
219                                         : Builder.CreateLShr(Vec, Amt))
220                            : Builder.CreateAShr(Vec, Amt));
221     }
222     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
223       if (LogicalShift)
224         return ConstantAggregateZero::get(VT);
225       Amt = ConstantInt::get(SVT, BitWidth - 1);
226       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
227     }
228   } else {
229     // Ensure the first element has an in-range value and the rest of the
230     // elements in the bottom 64 bits are zero.
231     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
232            cast<VectorType>(AmtVT)->getElementType() == SVT &&
233            "Unexpected shift-by-scalar type");
234     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
235     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
236     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
237     KnownBits KnownLowerBits = llvm::computeKnownBits(
238         Amt, DemandedLower, II.getModule()->getDataLayout());
239     KnownBits KnownUpperBits = llvm::computeKnownBits(
240         Amt, DemandedUpper, II.getModule()->getDataLayout());
241     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
242         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
243       SmallVector<int, 16> ZeroSplat(VWidth, 0);
244       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
245       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
246                                         : Builder.CreateLShr(Vec, Amt))
247                            : Builder.CreateAShr(Vec, Amt));
248     }
249   }
250 
251   // Simplify if count is constant vector.
252   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
253   if (!CDV)
254     return nullptr;
255 
256   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
257   // operand to compute the shift amount.
258   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
259          cast<VectorType>(AmtVT)->getElementType() == SVT &&
260          "Unexpected shift-by-scalar type");
261 
262   // Concatenate the sub-elements to create the 64-bit value.
263   APInt Count(64, 0);
264   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
265     unsigned SubEltIdx = (NumSubElts - 1) - i;
266     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
267     Count <<= BitWidth;
268     Count |= SubElt->getValue().zextOrTrunc(64);
269   }
270 
271   // If shift-by-zero then just return the original value.
272   if (Count.isZero())
273     return Vec;
274 
275   // Handle cases when Shift >= BitWidth.
276   if (Count.uge(BitWidth)) {
277     // If LogicalShift - just return zero.
278     if (LogicalShift)
279       return ConstantAggregateZero::get(VT);
280 
281     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
282     Count = APInt(64, BitWidth - 1);
283   }
284 
285   // Get a constant vector of the same type as the first operand.
286   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
287   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
288 
289   if (ShiftLeft)
290     return Builder.CreateShl(Vec, ShiftVec);
291 
292   if (LogicalShift)
293     return Builder.CreateLShr(Vec, ShiftVec);
294 
295   return Builder.CreateAShr(Vec, ShiftVec);
296 }
297 
298 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
299 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
300 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
301 static Value *simplifyX86varShift(const IntrinsicInst &II,
302                                   InstCombiner::BuilderTy &Builder) {
303   bool LogicalShift = false;
304   bool ShiftLeft = false;
305 
306   switch (II.getIntrinsicID()) {
307   default:
308     llvm_unreachable("Unexpected intrinsic!");
309   case Intrinsic::x86_avx2_psrav_d:
310   case Intrinsic::x86_avx2_psrav_d_256:
311   case Intrinsic::x86_avx512_psrav_q_128:
312   case Intrinsic::x86_avx512_psrav_q_256:
313   case Intrinsic::x86_avx512_psrav_d_512:
314   case Intrinsic::x86_avx512_psrav_q_512:
315   case Intrinsic::x86_avx512_psrav_w_128:
316   case Intrinsic::x86_avx512_psrav_w_256:
317   case Intrinsic::x86_avx512_psrav_w_512:
318     LogicalShift = false;
319     ShiftLeft = false;
320     break;
321   case Intrinsic::x86_avx2_psrlv_d:
322   case Intrinsic::x86_avx2_psrlv_d_256:
323   case Intrinsic::x86_avx2_psrlv_q:
324   case Intrinsic::x86_avx2_psrlv_q_256:
325   case Intrinsic::x86_avx512_psrlv_d_512:
326   case Intrinsic::x86_avx512_psrlv_q_512:
327   case Intrinsic::x86_avx512_psrlv_w_128:
328   case Intrinsic::x86_avx512_psrlv_w_256:
329   case Intrinsic::x86_avx512_psrlv_w_512:
330     LogicalShift = true;
331     ShiftLeft = false;
332     break;
333   case Intrinsic::x86_avx2_psllv_d:
334   case Intrinsic::x86_avx2_psllv_d_256:
335   case Intrinsic::x86_avx2_psllv_q:
336   case Intrinsic::x86_avx2_psllv_q_256:
337   case Intrinsic::x86_avx512_psllv_d_512:
338   case Intrinsic::x86_avx512_psllv_q_512:
339   case Intrinsic::x86_avx512_psllv_w_128:
340   case Intrinsic::x86_avx512_psllv_w_256:
341   case Intrinsic::x86_avx512_psllv_w_512:
342     LogicalShift = true;
343     ShiftLeft = true;
344     break;
345   }
346   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
347 
348   Value *Vec = II.getArgOperand(0);
349   Value *Amt = II.getArgOperand(1);
350   auto *VT = cast<FixedVectorType>(II.getType());
351   Type *SVT = VT->getElementType();
352   int NumElts = VT->getNumElements();
353   int BitWidth = SVT->getIntegerBitWidth();
354 
355   // If the shift amount is guaranteed to be in-range we can replace it with a
356   // generic shift.
357   KnownBits KnownAmt =
358       llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
359   if (KnownAmt.getMaxValue().ult(BitWidth)) {
360     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
361                                       : Builder.CreateLShr(Vec, Amt))
362                          : Builder.CreateAShr(Vec, Amt));
363   }
364 
365   // Simplify if all shift amounts are constant/undef.
366   auto *CShift = dyn_cast<Constant>(Amt);
367   if (!CShift)
368     return nullptr;
369 
370   // Collect each element's shift amount.
371   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
372   bool AnyOutOfRange = false;
373   SmallVector<int, 8> ShiftAmts;
374   for (int I = 0; I < NumElts; ++I) {
375     auto *CElt = CShift->getAggregateElement(I);
376     if (isa_and_nonnull<UndefValue>(CElt)) {
377       ShiftAmts.push_back(-1);
378       continue;
379     }
380 
381     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
382     if (!COp)
383       return nullptr;
384 
385     // Handle out of range shifts.
386     // If LogicalShift - set to BitWidth (special case).
387     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
388     APInt ShiftVal = COp->getValue();
389     if (ShiftVal.uge(BitWidth)) {
390       AnyOutOfRange = LogicalShift;
391       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
392       continue;
393     }
394 
395     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
396   }
397 
398   // If all elements out of range or UNDEF, return vector of zeros/undefs.
399   // ArithmeticShift should only hit this if they are all UNDEF.
400   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
401   if (llvm::all_of(ShiftAmts, OutOfRange)) {
402     SmallVector<Constant *, 8> ConstantVec;
403     for (int Idx : ShiftAmts) {
404       if (Idx < 0) {
405         ConstantVec.push_back(UndefValue::get(SVT));
406       } else {
407         assert(LogicalShift && "Logical shift expected");
408         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
409       }
410     }
411     return ConstantVector::get(ConstantVec);
412   }
413 
414   // We can't handle only some out of range values with generic logical shifts.
415   if (AnyOutOfRange)
416     return nullptr;
417 
418   // Build the shift amount constant vector.
419   SmallVector<Constant *, 8> ShiftVecAmts;
420   for (int Idx : ShiftAmts) {
421     if (Idx < 0)
422       ShiftVecAmts.push_back(UndefValue::get(SVT));
423     else
424       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
425   }
426   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
427 
428   if (ShiftLeft)
429     return Builder.CreateShl(Vec, ShiftVec);
430 
431   if (LogicalShift)
432     return Builder.CreateLShr(Vec, ShiftVec);
433 
434   return Builder.CreateAShr(Vec, ShiftVec);
435 }
436 
437 static Value *simplifyX86pack(IntrinsicInst &II,
438                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
439   Value *Arg0 = II.getArgOperand(0);
440   Value *Arg1 = II.getArgOperand(1);
441   Type *ResTy = II.getType();
442 
443   // Fast all undef handling.
444   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
445     return UndefValue::get(ResTy);
446 
447   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
448   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
449   unsigned NumSrcElts = ArgTy->getNumElements();
450   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
451          "Unexpected packing types");
452 
453   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
454   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
455   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
456   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
457          "Unexpected packing types");
458 
459   // Constant folding.
460   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
461     return nullptr;
462 
463   // Clamp Values - signed/unsigned both use signed clamp values, but they
464   // differ on the min/max values.
465   APInt MinValue, MaxValue;
466   if (IsSigned) {
467     // PACKSS: Truncate signed value with signed saturation.
468     // Source values less than dst minint are saturated to minint.
469     // Source values greater than dst maxint are saturated to maxint.
470     MinValue =
471         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
472     MaxValue =
473         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474   } else {
475     // PACKUS: Truncate signed value with unsigned saturation.
476     // Source values less than zero are saturated to zero.
477     // Source values greater than dst maxuint are saturated to maxuint.
478     MinValue = APInt::getZero(SrcScalarSizeInBits);
479     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
480   }
481 
482   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
483   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
484   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
485   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
486   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
487   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
488 
489   // Shuffle clamped args together at the lane level.
490   SmallVector<int, 32> PackMask;
491   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
492     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
493       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
494     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
496   }
497   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
498 
499   // Truncate to dst size.
500   return Builder.CreateTrunc(Shuffle, ResTy);
501 }
502 
503 static Value *simplifyX86movmsk(const IntrinsicInst &II,
504                                 InstCombiner::BuilderTy &Builder) {
505   Value *Arg = II.getArgOperand(0);
506   Type *ResTy = II.getType();
507 
508   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
509   if (isa<UndefValue>(Arg))
510     return Constant::getNullValue(ResTy);
511 
512   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
513   // We can't easily peek through x86_mmx types.
514   if (!ArgTy)
515     return nullptr;
516 
517   // Expand MOVMSK to compare/bitcast/zext:
518   // e.g. PMOVMSKB(v16i8 x):
519   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
520   // %int = bitcast <16 x i1> %cmp to i16
521   // %res = zext i16 %int to i32
522   unsigned NumElts = ArgTy->getNumElements();
523   Type *IntegerTy = Builder.getIntNTy(NumElts);
524 
525   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
526   Res = Builder.CreateIsNeg(Res);
527   Res = Builder.CreateBitCast(Res, IntegerTy);
528   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
529   return Res;
530 }
531 
532 static Value *simplifyX86addcarry(const IntrinsicInst &II,
533                                   InstCombiner::BuilderTy &Builder) {
534   Value *CarryIn = II.getArgOperand(0);
535   Value *Op1 = II.getArgOperand(1);
536   Value *Op2 = II.getArgOperand(2);
537   Type *RetTy = II.getType();
538   Type *OpTy = Op1->getType();
539   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
540          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
541          "Unexpected types for x86 addcarry");
542 
543   // If carry-in is zero, this is just an unsigned add with overflow.
544   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
545     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
546                                           {Op1, Op2});
547     // The types have to be adjusted to match the x86 call types.
548     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
549     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
550                                        Builder.getInt8Ty());
551     Value *Res = UndefValue::get(RetTy);
552     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
553     return Builder.CreateInsertValue(Res, UAddResult, 1);
554   }
555 
556   return nullptr;
557 }
558 
559 static Value *simplifyX86insertps(const IntrinsicInst &II,
560                                   InstCombiner::BuilderTy &Builder) {
561   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
562   if (!CInt)
563     return nullptr;
564 
565   auto *VecTy = cast<FixedVectorType>(II.getType());
566   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
567 
568   // The immediate permute control byte looks like this:
569   //    [3:0] - zero mask for each 32-bit lane
570   //    [5:4] - select one 32-bit destination lane
571   //    [7:6] - select one 32-bit source lane
572 
573   uint8_t Imm = CInt->getZExtValue();
574   uint8_t ZMask = Imm & 0xf;
575   uint8_t DestLane = (Imm >> 4) & 0x3;
576   uint8_t SourceLane = (Imm >> 6) & 0x3;
577 
578   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
579 
580   // If all zero mask bits are set, this was just a weird way to
581   // generate a zero vector.
582   if (ZMask == 0xf)
583     return ZeroVector;
584 
585   // Initialize by passing all of the first source bits through.
586   int ShuffleMask[4] = {0, 1, 2, 3};
587 
588   // We may replace the second operand with the zero vector.
589   Value *V1 = II.getArgOperand(1);
590 
591   if (ZMask) {
592     // If the zero mask is being used with a single input or the zero mask
593     // overrides the destination lane, this is a shuffle with the zero vector.
594     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
595         (ZMask & (1 << DestLane))) {
596       V1 = ZeroVector;
597       // We may still move 32-bits of the first source vector from one lane
598       // to another.
599       ShuffleMask[DestLane] = SourceLane;
600       // The zero mask may override the previous insert operation.
601       for (unsigned i = 0; i < 4; ++i)
602         if ((ZMask >> i) & 0x1)
603           ShuffleMask[i] = i + 4;
604     } else {
605       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
606       return nullptr;
607     }
608   } else {
609     // Replace the selected destination lane with the selected source lane.
610     ShuffleMask[DestLane] = SourceLane + 4;
611   }
612 
613   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
614 }
615 
616 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
617 /// or conversion to a shuffle vector.
618 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
619                                ConstantInt *CILength, ConstantInt *CIIndex,
620                                InstCombiner::BuilderTy &Builder) {
621   auto LowConstantHighUndef = [&](uint64_t Val) {
622     Type *IntTy64 = Type::getInt64Ty(II.getContext());
623     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
624                         UndefValue::get(IntTy64)};
625     return ConstantVector::get(Args);
626   };
627 
628   // See if we're dealing with constant values.
629   auto *C0 = dyn_cast<Constant>(Op0);
630   auto *CI0 =
631       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
632          : nullptr;
633 
634   // Attempt to constant fold.
635   if (CILength && CIIndex) {
636     // From AMD documentation: "The bit index and field length are each six
637     // bits in length other bits of the field are ignored."
638     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
639     APInt APLength = CILength->getValue().zextOrTrunc(6);
640 
641     unsigned Index = APIndex.getZExtValue();
642 
643     // From AMD documentation: "a value of zero in the field length is
644     // defined as length of 64".
645     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
646 
647     // From AMD documentation: "If the sum of the bit index + length field
648     // is greater than 64, the results are undefined".
649     unsigned End = Index + Length;
650 
651     // Note that both field index and field length are 8-bit quantities.
652     // Since variables 'Index' and 'Length' are unsigned values
653     // obtained from zero-extending field index and field length
654     // respectively, their sum should never wrap around.
655     if (End > 64)
656       return UndefValue::get(II.getType());
657 
658     // If we are inserting whole bytes, we can convert this to a shuffle.
659     // Lowering can recognize EXTRQI shuffle masks.
660     if ((Length % 8) == 0 && (Index % 8) == 0) {
661       // Convert bit indices to byte indices.
662       Length /= 8;
663       Index /= 8;
664 
665       Type *IntTy8 = Type::getInt8Ty(II.getContext());
666       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
667 
668       SmallVector<int, 16> ShuffleMask;
669       for (int i = 0; i != (int)Length; ++i)
670         ShuffleMask.push_back(i + Index);
671       for (int i = Length; i != 8; ++i)
672         ShuffleMask.push_back(i + 16);
673       for (int i = 8; i != 16; ++i)
674         ShuffleMask.push_back(-1);
675 
676       Value *SV = Builder.CreateShuffleVector(
677           Builder.CreateBitCast(Op0, ShufTy),
678           ConstantAggregateZero::get(ShufTy), ShuffleMask);
679       return Builder.CreateBitCast(SV, II.getType());
680     }
681 
682     // Constant Fold - shift Index'th bit to lowest position and mask off
683     // Length bits.
684     if (CI0) {
685       APInt Elt = CI0->getValue();
686       Elt.lshrInPlace(Index);
687       Elt = Elt.zextOrTrunc(Length);
688       return LowConstantHighUndef(Elt.getZExtValue());
689     }
690 
691     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
692     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
693       Value *Args[] = {Op0, CILength, CIIndex};
694       Module *M = II.getModule();
695       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
696       return Builder.CreateCall(F, Args);
697     }
698   }
699 
700   // Constant Fold - extraction from zero is always {zero, undef}.
701   if (CI0 && CI0->isZero())
702     return LowConstantHighUndef(0);
703 
704   return nullptr;
705 }
706 
707 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
708 /// folding or conversion to a shuffle vector.
709 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
710                                  APInt APLength, APInt APIndex,
711                                  InstCombiner::BuilderTy &Builder) {
712   // From AMD documentation: "The bit index and field length are each six bits
713   // in length other bits of the field are ignored."
714   APIndex = APIndex.zextOrTrunc(6);
715   APLength = APLength.zextOrTrunc(6);
716 
717   // Attempt to constant fold.
718   unsigned Index = APIndex.getZExtValue();
719 
720   // From AMD documentation: "a value of zero in the field length is
721   // defined as length of 64".
722   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
723 
724   // From AMD documentation: "If the sum of the bit index + length field
725   // is greater than 64, the results are undefined".
726   unsigned End = Index + Length;
727 
728   // Note that both field index and field length are 8-bit quantities.
729   // Since variables 'Index' and 'Length' are unsigned values
730   // obtained from zero-extending field index and field length
731   // respectively, their sum should never wrap around.
732   if (End > 64)
733     return UndefValue::get(II.getType());
734 
735   // If we are inserting whole bytes, we can convert this to a shuffle.
736   // Lowering can recognize INSERTQI shuffle masks.
737   if ((Length % 8) == 0 && (Index % 8) == 0) {
738     // Convert bit indices to byte indices.
739     Length /= 8;
740     Index /= 8;
741 
742     Type *IntTy8 = Type::getInt8Ty(II.getContext());
743     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
744 
745     SmallVector<int, 16> ShuffleMask;
746     for (int i = 0; i != (int)Index; ++i)
747       ShuffleMask.push_back(i);
748     for (int i = 0; i != (int)Length; ++i)
749       ShuffleMask.push_back(i + 16);
750     for (int i = Index + Length; i != 8; ++i)
751       ShuffleMask.push_back(i);
752     for (int i = 8; i != 16; ++i)
753       ShuffleMask.push_back(-1);
754 
755     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
756                                             Builder.CreateBitCast(Op1, ShufTy),
757                                             ShuffleMask);
758     return Builder.CreateBitCast(SV, II.getType());
759   }
760 
761   // See if we're dealing with constant values.
762   auto *C0 = dyn_cast<Constant>(Op0);
763   auto *C1 = dyn_cast<Constant>(Op1);
764   auto *CI00 =
765       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
766          : nullptr;
767   auto *CI10 =
768       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
769          : nullptr;
770 
771   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
772   if (CI00 && CI10) {
773     APInt V00 = CI00->getValue();
774     APInt V10 = CI10->getValue();
775     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
776     V00 = V00 & ~Mask;
777     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
778     APInt Val = V00 | V10;
779     Type *IntTy64 = Type::getInt64Ty(II.getContext());
780     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
781                         UndefValue::get(IntTy64)};
782     return ConstantVector::get(Args);
783   }
784 
785   // If we were an INSERTQ call, we'll save demanded elements if we convert to
786   // INSERTQI.
787   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
788     Type *IntTy8 = Type::getInt8Ty(II.getContext());
789     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
790     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
791 
792     Value *Args[] = {Op0, Op1, CILength, CIIndex};
793     Module *M = II.getModule();
794     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
795     return Builder.CreateCall(F, Args);
796   }
797 
798   return nullptr;
799 }
800 
801 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
802 static Value *simplifyX86pshufb(const IntrinsicInst &II,
803                                 InstCombiner::BuilderTy &Builder) {
804   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
805   if (!V)
806     return nullptr;
807 
808   auto *VecTy = cast<FixedVectorType>(II.getType());
809   unsigned NumElts = VecTy->getNumElements();
810   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
811          "Unexpected number of elements in shuffle mask!");
812 
813   // Construct a shuffle mask from constant integers or UNDEFs.
814   int Indexes[64];
815 
816   // Each byte in the shuffle control mask forms an index to permute the
817   // corresponding byte in the destination operand.
818   for (unsigned I = 0; I < NumElts; ++I) {
819     Constant *COp = V->getAggregateElement(I);
820     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
821       return nullptr;
822 
823     if (isa<UndefValue>(COp)) {
824       Indexes[I] = -1;
825       continue;
826     }
827 
828     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
829 
830     // If the most significant bit (bit[7]) of each byte of the shuffle
831     // control mask is set, then zero is written in the result byte.
832     // The zero vector is in the right-hand side of the resulting
833     // shufflevector.
834 
835     // The value of each index for the high 128-bit lane is the least
836     // significant 4 bits of the respective shuffle control byte.
837     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
838     Indexes[I] = Index;
839   }
840 
841   auto V1 = II.getArgOperand(0);
842   auto V2 = Constant::getNullValue(VecTy);
843   return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
844 }
845 
846 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
847 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
848                                     InstCombiner::BuilderTy &Builder) {
849   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
850   if (!V)
851     return nullptr;
852 
853   auto *VecTy = cast<FixedVectorType>(II.getType());
854   unsigned NumElts = VecTy->getNumElements();
855   bool IsPD = VecTy->getScalarType()->isDoubleTy();
856   unsigned NumLaneElts = IsPD ? 2 : 4;
857   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
858 
859   // Construct a shuffle mask from constant integers or UNDEFs.
860   int Indexes[16];
861 
862   // The intrinsics only read one or two bits, clear the rest.
863   for (unsigned I = 0; I < NumElts; ++I) {
864     Constant *COp = V->getAggregateElement(I);
865     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
866       return nullptr;
867 
868     if (isa<UndefValue>(COp)) {
869       Indexes[I] = -1;
870       continue;
871     }
872 
873     APInt Index = cast<ConstantInt>(COp)->getValue();
874     Index = Index.zextOrTrunc(32).getLoBits(2);
875 
876     // The PD variants uses bit 1 to select per-lane element index, so
877     // shift down to convert to generic shuffle mask index.
878     if (IsPD)
879       Index.lshrInPlace(1);
880 
881     // The _256 variants are a bit trickier since the mask bits always index
882     // into the corresponding 128 half. In order to convert to a generic
883     // shuffle, we have to make that explicit.
884     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
885 
886     Indexes[I] = Index.getZExtValue();
887   }
888 
889   auto V1 = II.getArgOperand(0);
890   return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
891 }
892 
893 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
894 static Value *simplifyX86vpermv(const IntrinsicInst &II,
895                                 InstCombiner::BuilderTy &Builder) {
896   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
897   if (!V)
898     return nullptr;
899 
900   auto *VecTy = cast<FixedVectorType>(II.getType());
901   unsigned Size = VecTy->getNumElements();
902   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
903          "Unexpected shuffle mask size");
904 
905   // Construct a shuffle mask from constant integers or UNDEFs.
906   int Indexes[64];
907 
908   for (unsigned I = 0; I < Size; ++I) {
909     Constant *COp = V->getAggregateElement(I);
910     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
911       return nullptr;
912 
913     if (isa<UndefValue>(COp)) {
914       Indexes[I] = -1;
915       continue;
916     }
917 
918     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
919     Index &= Size - 1;
920     Indexes[I] = Index;
921   }
922 
923   auto V1 = II.getArgOperand(0);
924   return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
925 }
926 
927 Optional<Instruction *>
928 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
929   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
930                                              unsigned DemandedWidth) {
931     APInt UndefElts(Width, 0);
932     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
933     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
934   };
935 
936   Intrinsic::ID IID = II.getIntrinsicID();
937   switch (IID) {
938   case Intrinsic::x86_bmi_bextr_32:
939   case Intrinsic::x86_bmi_bextr_64:
940   case Intrinsic::x86_tbm_bextri_u32:
941   case Intrinsic::x86_tbm_bextri_u64:
942     // If the RHS is a constant we can try some simplifications.
943     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
944       uint64_t Shift = C->getZExtValue();
945       uint64_t Length = (Shift >> 8) & 0xff;
946       Shift &= 0xff;
947       unsigned BitWidth = II.getType()->getIntegerBitWidth();
948       // If the length is 0 or the shift is out of range, replace with zero.
949       if (Length == 0 || Shift >= BitWidth) {
950         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
951       }
952       // If the LHS is also a constant, we can completely constant fold this.
953       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
954         uint64_t Result = InC->getZExtValue() >> Shift;
955         if (Length > BitWidth)
956           Length = BitWidth;
957         Result &= maskTrailingOnes<uint64_t>(Length);
958         return IC.replaceInstUsesWith(II,
959                                       ConstantInt::get(II.getType(), Result));
960       }
961       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
962       // are only masking bits that a shift already cleared?
963     }
964     break;
965 
966   case Intrinsic::x86_bmi_bzhi_32:
967   case Intrinsic::x86_bmi_bzhi_64:
968     // If the RHS is a constant we can try some simplifications.
969     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
970       uint64_t Index = C->getZExtValue() & 0xff;
971       unsigned BitWidth = II.getType()->getIntegerBitWidth();
972       if (Index >= BitWidth) {
973         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
974       }
975       if (Index == 0) {
976         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
977       }
978       // If the LHS is also a constant, we can completely constant fold this.
979       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
980         uint64_t Result = InC->getZExtValue();
981         Result &= maskTrailingOnes<uint64_t>(Index);
982         return IC.replaceInstUsesWith(II,
983                                       ConstantInt::get(II.getType(), Result));
984       }
985       // TODO should we convert this to an AND if the RHS is constant?
986     }
987     break;
988   case Intrinsic::x86_bmi_pext_32:
989   case Intrinsic::x86_bmi_pext_64:
990     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
991       if (MaskC->isNullValue()) {
992         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
993       }
994       if (MaskC->isAllOnesValue()) {
995         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
996       }
997 
998       unsigned MaskIdx, MaskLen;
999       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1000         // any single contingous sequence of 1s anywhere in the mask simply
1001         // describes a subset of the input bits shifted to the appropriate
1002         // position.  Replace with the straight forward IR.
1003         Value *Input = II.getArgOperand(0);
1004         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1005         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1006         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
1007         return IC.replaceInstUsesWith(II, Shifted);
1008       }
1009 
1010       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1011         uint64_t Src = SrcC->getZExtValue();
1012         uint64_t Mask = MaskC->getZExtValue();
1013         uint64_t Result = 0;
1014         uint64_t BitToSet = 1;
1015 
1016         while (Mask) {
1017           // Isolate lowest set bit.
1018           uint64_t BitToTest = Mask & -Mask;
1019           if (BitToTest & Src)
1020             Result |= BitToSet;
1021 
1022           BitToSet <<= 1;
1023           // Clear lowest set bit.
1024           Mask &= Mask - 1;
1025         }
1026 
1027         return IC.replaceInstUsesWith(II,
1028                                       ConstantInt::get(II.getType(), Result));
1029       }
1030     }
1031     break;
1032   case Intrinsic::x86_bmi_pdep_32:
1033   case Intrinsic::x86_bmi_pdep_64:
1034     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1035       if (MaskC->isNullValue()) {
1036         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1037       }
1038       if (MaskC->isAllOnesValue()) {
1039         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1040       }
1041 
1042       unsigned MaskIdx, MaskLen;
1043       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1044         // any single contingous sequence of 1s anywhere in the mask simply
1045         // describes a subset of the input bits shifted to the appropriate
1046         // position.  Replace with the straight forward IR.
1047         Value *Input = II.getArgOperand(0);
1048         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1049         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
1050         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1051         return IC.replaceInstUsesWith(II, Masked);
1052       }
1053 
1054       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1055         uint64_t Src = SrcC->getZExtValue();
1056         uint64_t Mask = MaskC->getZExtValue();
1057         uint64_t Result = 0;
1058         uint64_t BitToTest = 1;
1059 
1060         while (Mask) {
1061           // Isolate lowest set bit.
1062           uint64_t BitToSet = Mask & -Mask;
1063           if (BitToTest & Src)
1064             Result |= BitToSet;
1065 
1066           BitToTest <<= 1;
1067           // Clear lowest set bit;
1068           Mask &= Mask - 1;
1069         }
1070 
1071         return IC.replaceInstUsesWith(II,
1072                                       ConstantInt::get(II.getType(), Result));
1073       }
1074     }
1075     break;
1076 
1077   case Intrinsic::x86_sse_cvtss2si:
1078   case Intrinsic::x86_sse_cvtss2si64:
1079   case Intrinsic::x86_sse_cvttss2si:
1080   case Intrinsic::x86_sse_cvttss2si64:
1081   case Intrinsic::x86_sse2_cvtsd2si:
1082   case Intrinsic::x86_sse2_cvtsd2si64:
1083   case Intrinsic::x86_sse2_cvttsd2si:
1084   case Intrinsic::x86_sse2_cvttsd2si64:
1085   case Intrinsic::x86_avx512_vcvtss2si32:
1086   case Intrinsic::x86_avx512_vcvtss2si64:
1087   case Intrinsic::x86_avx512_vcvtss2usi32:
1088   case Intrinsic::x86_avx512_vcvtss2usi64:
1089   case Intrinsic::x86_avx512_vcvtsd2si32:
1090   case Intrinsic::x86_avx512_vcvtsd2si64:
1091   case Intrinsic::x86_avx512_vcvtsd2usi32:
1092   case Intrinsic::x86_avx512_vcvtsd2usi64:
1093   case Intrinsic::x86_avx512_cvttss2si:
1094   case Intrinsic::x86_avx512_cvttss2si64:
1095   case Intrinsic::x86_avx512_cvttss2usi:
1096   case Intrinsic::x86_avx512_cvttss2usi64:
1097   case Intrinsic::x86_avx512_cvttsd2si:
1098   case Intrinsic::x86_avx512_cvttsd2si64:
1099   case Intrinsic::x86_avx512_cvttsd2usi:
1100   case Intrinsic::x86_avx512_cvttsd2usi64: {
1101     // These intrinsics only demand the 0th element of their input vectors. If
1102     // we can simplify the input based on that, do so now.
1103     Value *Arg = II.getArgOperand(0);
1104     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
1105     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
1106       return IC.replaceOperand(II, 0, V);
1107     }
1108     break;
1109   }
1110 
1111   case Intrinsic::x86_mmx_pmovmskb:
1112   case Intrinsic::x86_sse_movmsk_ps:
1113   case Intrinsic::x86_sse2_movmsk_pd:
1114   case Intrinsic::x86_sse2_pmovmskb_128:
1115   case Intrinsic::x86_avx_movmsk_pd_256:
1116   case Intrinsic::x86_avx_movmsk_ps_256:
1117   case Intrinsic::x86_avx2_pmovmskb:
1118     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
1119       return IC.replaceInstUsesWith(II, V);
1120     }
1121     break;
1122 
1123   case Intrinsic::x86_sse_comieq_ss:
1124   case Intrinsic::x86_sse_comige_ss:
1125   case Intrinsic::x86_sse_comigt_ss:
1126   case Intrinsic::x86_sse_comile_ss:
1127   case Intrinsic::x86_sse_comilt_ss:
1128   case Intrinsic::x86_sse_comineq_ss:
1129   case Intrinsic::x86_sse_ucomieq_ss:
1130   case Intrinsic::x86_sse_ucomige_ss:
1131   case Intrinsic::x86_sse_ucomigt_ss:
1132   case Intrinsic::x86_sse_ucomile_ss:
1133   case Intrinsic::x86_sse_ucomilt_ss:
1134   case Intrinsic::x86_sse_ucomineq_ss:
1135   case Intrinsic::x86_sse2_comieq_sd:
1136   case Intrinsic::x86_sse2_comige_sd:
1137   case Intrinsic::x86_sse2_comigt_sd:
1138   case Intrinsic::x86_sse2_comile_sd:
1139   case Intrinsic::x86_sse2_comilt_sd:
1140   case Intrinsic::x86_sse2_comineq_sd:
1141   case Intrinsic::x86_sse2_ucomieq_sd:
1142   case Intrinsic::x86_sse2_ucomige_sd:
1143   case Intrinsic::x86_sse2_ucomigt_sd:
1144   case Intrinsic::x86_sse2_ucomile_sd:
1145   case Intrinsic::x86_sse2_ucomilt_sd:
1146   case Intrinsic::x86_sse2_ucomineq_sd:
1147   case Intrinsic::x86_avx512_vcomi_ss:
1148   case Intrinsic::x86_avx512_vcomi_sd:
1149   case Intrinsic::x86_avx512_mask_cmp_ss:
1150   case Intrinsic::x86_avx512_mask_cmp_sd: {
1151     // These intrinsics only demand the 0th element of their input vectors. If
1152     // we can simplify the input based on that, do so now.
1153     bool MadeChange = false;
1154     Value *Arg0 = II.getArgOperand(0);
1155     Value *Arg1 = II.getArgOperand(1);
1156     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1157     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
1158       IC.replaceOperand(II, 0, V);
1159       MadeChange = true;
1160     }
1161     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
1162       IC.replaceOperand(II, 1, V);
1163       MadeChange = true;
1164     }
1165     if (MadeChange) {
1166       return &II;
1167     }
1168     break;
1169   }
1170 
1171   case Intrinsic::x86_avx512_add_ps_512:
1172   case Intrinsic::x86_avx512_div_ps_512:
1173   case Intrinsic::x86_avx512_mul_ps_512:
1174   case Intrinsic::x86_avx512_sub_ps_512:
1175   case Intrinsic::x86_avx512_add_pd_512:
1176   case Intrinsic::x86_avx512_div_pd_512:
1177   case Intrinsic::x86_avx512_mul_pd_512:
1178   case Intrinsic::x86_avx512_sub_pd_512:
1179     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1180     // IR operations.
1181     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1182       if (R->getValue() == 4) {
1183         Value *Arg0 = II.getArgOperand(0);
1184         Value *Arg1 = II.getArgOperand(1);
1185 
1186         Value *V;
1187         switch (IID) {
1188         default:
1189           llvm_unreachable("Case stmts out of sync!");
1190         case Intrinsic::x86_avx512_add_ps_512:
1191         case Intrinsic::x86_avx512_add_pd_512:
1192           V = IC.Builder.CreateFAdd(Arg0, Arg1);
1193           break;
1194         case Intrinsic::x86_avx512_sub_ps_512:
1195         case Intrinsic::x86_avx512_sub_pd_512:
1196           V = IC.Builder.CreateFSub(Arg0, Arg1);
1197           break;
1198         case Intrinsic::x86_avx512_mul_ps_512:
1199         case Intrinsic::x86_avx512_mul_pd_512:
1200           V = IC.Builder.CreateFMul(Arg0, Arg1);
1201           break;
1202         case Intrinsic::x86_avx512_div_ps_512:
1203         case Intrinsic::x86_avx512_div_pd_512:
1204           V = IC.Builder.CreateFDiv(Arg0, Arg1);
1205           break;
1206         }
1207 
1208         return IC.replaceInstUsesWith(II, V);
1209       }
1210     }
1211     break;
1212 
1213   case Intrinsic::x86_avx512_mask_add_ss_round:
1214   case Intrinsic::x86_avx512_mask_div_ss_round:
1215   case Intrinsic::x86_avx512_mask_mul_ss_round:
1216   case Intrinsic::x86_avx512_mask_sub_ss_round:
1217   case Intrinsic::x86_avx512_mask_add_sd_round:
1218   case Intrinsic::x86_avx512_mask_div_sd_round:
1219   case Intrinsic::x86_avx512_mask_mul_sd_round:
1220   case Intrinsic::x86_avx512_mask_sub_sd_round:
1221     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1222     // IR operations.
1223     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
1224       if (R->getValue() == 4) {
1225         // Extract the element as scalars.
1226         Value *Arg0 = II.getArgOperand(0);
1227         Value *Arg1 = II.getArgOperand(1);
1228         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1229         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1230 
1231         Value *V;
1232         switch (IID) {
1233         default:
1234           llvm_unreachable("Case stmts out of sync!");
1235         case Intrinsic::x86_avx512_mask_add_ss_round:
1236         case Intrinsic::x86_avx512_mask_add_sd_round:
1237           V = IC.Builder.CreateFAdd(LHS, RHS);
1238           break;
1239         case Intrinsic::x86_avx512_mask_sub_ss_round:
1240         case Intrinsic::x86_avx512_mask_sub_sd_round:
1241           V = IC.Builder.CreateFSub(LHS, RHS);
1242           break;
1243         case Intrinsic::x86_avx512_mask_mul_ss_round:
1244         case Intrinsic::x86_avx512_mask_mul_sd_round:
1245           V = IC.Builder.CreateFMul(LHS, RHS);
1246           break;
1247         case Intrinsic::x86_avx512_mask_div_ss_round:
1248         case Intrinsic::x86_avx512_mask_div_sd_round:
1249           V = IC.Builder.CreateFDiv(LHS, RHS);
1250           break;
1251         }
1252 
1253         // Handle the masking aspect of the intrinsic.
1254         Value *Mask = II.getArgOperand(3);
1255         auto *C = dyn_cast<ConstantInt>(Mask);
1256         // We don't need a select if we know the mask bit is a 1.
1257         if (!C || !C->getValue()[0]) {
1258           // Cast the mask to an i1 vector and then extract the lowest element.
1259           auto *MaskTy = FixedVectorType::get(
1260               IC.Builder.getInt1Ty(),
1261               cast<IntegerType>(Mask->getType())->getBitWidth());
1262           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
1263           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
1264           // Extract the lowest element from the passthru operand.
1265           Value *Passthru =
1266               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
1267           V = IC.Builder.CreateSelect(Mask, V, Passthru);
1268         }
1269 
1270         // Insert the result back into the original argument 0.
1271         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
1272 
1273         return IC.replaceInstUsesWith(II, V);
1274       }
1275     }
1276     break;
1277 
1278   // Constant fold ashr( <A x Bi>, Ci ).
1279   // Constant fold lshr( <A x Bi>, Ci ).
1280   // Constant fold shl( <A x Bi>, Ci ).
1281   case Intrinsic::x86_sse2_psrai_d:
1282   case Intrinsic::x86_sse2_psrai_w:
1283   case Intrinsic::x86_avx2_psrai_d:
1284   case Intrinsic::x86_avx2_psrai_w:
1285   case Intrinsic::x86_avx512_psrai_q_128:
1286   case Intrinsic::x86_avx512_psrai_q_256:
1287   case Intrinsic::x86_avx512_psrai_d_512:
1288   case Intrinsic::x86_avx512_psrai_q_512:
1289   case Intrinsic::x86_avx512_psrai_w_512:
1290   case Intrinsic::x86_sse2_psrli_d:
1291   case Intrinsic::x86_sse2_psrli_q:
1292   case Intrinsic::x86_sse2_psrli_w:
1293   case Intrinsic::x86_avx2_psrli_d:
1294   case Intrinsic::x86_avx2_psrli_q:
1295   case Intrinsic::x86_avx2_psrli_w:
1296   case Intrinsic::x86_avx512_psrli_d_512:
1297   case Intrinsic::x86_avx512_psrli_q_512:
1298   case Intrinsic::x86_avx512_psrli_w_512:
1299   case Intrinsic::x86_sse2_pslli_d:
1300   case Intrinsic::x86_sse2_pslli_q:
1301   case Intrinsic::x86_sse2_pslli_w:
1302   case Intrinsic::x86_avx2_pslli_d:
1303   case Intrinsic::x86_avx2_pslli_q:
1304   case Intrinsic::x86_avx2_pslli_w:
1305   case Intrinsic::x86_avx512_pslli_d_512:
1306   case Intrinsic::x86_avx512_pslli_q_512:
1307   case Intrinsic::x86_avx512_pslli_w_512:
1308     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1309       return IC.replaceInstUsesWith(II, V);
1310     }
1311     break;
1312 
1313   case Intrinsic::x86_sse2_psra_d:
1314   case Intrinsic::x86_sse2_psra_w:
1315   case Intrinsic::x86_avx2_psra_d:
1316   case Intrinsic::x86_avx2_psra_w:
1317   case Intrinsic::x86_avx512_psra_q_128:
1318   case Intrinsic::x86_avx512_psra_q_256:
1319   case Intrinsic::x86_avx512_psra_d_512:
1320   case Intrinsic::x86_avx512_psra_q_512:
1321   case Intrinsic::x86_avx512_psra_w_512:
1322   case Intrinsic::x86_sse2_psrl_d:
1323   case Intrinsic::x86_sse2_psrl_q:
1324   case Intrinsic::x86_sse2_psrl_w:
1325   case Intrinsic::x86_avx2_psrl_d:
1326   case Intrinsic::x86_avx2_psrl_q:
1327   case Intrinsic::x86_avx2_psrl_w:
1328   case Intrinsic::x86_avx512_psrl_d_512:
1329   case Intrinsic::x86_avx512_psrl_q_512:
1330   case Intrinsic::x86_avx512_psrl_w_512:
1331   case Intrinsic::x86_sse2_psll_d:
1332   case Intrinsic::x86_sse2_psll_q:
1333   case Intrinsic::x86_sse2_psll_w:
1334   case Intrinsic::x86_avx2_psll_d:
1335   case Intrinsic::x86_avx2_psll_q:
1336   case Intrinsic::x86_avx2_psll_w:
1337   case Intrinsic::x86_avx512_psll_d_512:
1338   case Intrinsic::x86_avx512_psll_q_512:
1339   case Intrinsic::x86_avx512_psll_w_512: {
1340     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1341       return IC.replaceInstUsesWith(II, V);
1342     }
1343 
1344     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
1345     // operand to compute the shift amount.
1346     Value *Arg1 = II.getArgOperand(1);
1347     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
1348            "Unexpected packed shift size");
1349     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
1350 
1351     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
1352       return IC.replaceOperand(II, 1, V);
1353     }
1354     break;
1355   }
1356 
1357   case Intrinsic::x86_avx2_psllv_d:
1358   case Intrinsic::x86_avx2_psllv_d_256:
1359   case Intrinsic::x86_avx2_psllv_q:
1360   case Intrinsic::x86_avx2_psllv_q_256:
1361   case Intrinsic::x86_avx512_psllv_d_512:
1362   case Intrinsic::x86_avx512_psllv_q_512:
1363   case Intrinsic::x86_avx512_psllv_w_128:
1364   case Intrinsic::x86_avx512_psllv_w_256:
1365   case Intrinsic::x86_avx512_psllv_w_512:
1366   case Intrinsic::x86_avx2_psrav_d:
1367   case Intrinsic::x86_avx2_psrav_d_256:
1368   case Intrinsic::x86_avx512_psrav_q_128:
1369   case Intrinsic::x86_avx512_psrav_q_256:
1370   case Intrinsic::x86_avx512_psrav_d_512:
1371   case Intrinsic::x86_avx512_psrav_q_512:
1372   case Intrinsic::x86_avx512_psrav_w_128:
1373   case Intrinsic::x86_avx512_psrav_w_256:
1374   case Intrinsic::x86_avx512_psrav_w_512:
1375   case Intrinsic::x86_avx2_psrlv_d:
1376   case Intrinsic::x86_avx2_psrlv_d_256:
1377   case Intrinsic::x86_avx2_psrlv_q:
1378   case Intrinsic::x86_avx2_psrlv_q_256:
1379   case Intrinsic::x86_avx512_psrlv_d_512:
1380   case Intrinsic::x86_avx512_psrlv_q_512:
1381   case Intrinsic::x86_avx512_psrlv_w_128:
1382   case Intrinsic::x86_avx512_psrlv_w_256:
1383   case Intrinsic::x86_avx512_psrlv_w_512:
1384     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
1385       return IC.replaceInstUsesWith(II, V);
1386     }
1387     break;
1388 
1389   case Intrinsic::x86_sse2_packssdw_128:
1390   case Intrinsic::x86_sse2_packsswb_128:
1391   case Intrinsic::x86_avx2_packssdw:
1392   case Intrinsic::x86_avx2_packsswb:
1393   case Intrinsic::x86_avx512_packssdw_512:
1394   case Intrinsic::x86_avx512_packsswb_512:
1395     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
1396       return IC.replaceInstUsesWith(II, V);
1397     }
1398     break;
1399 
1400   case Intrinsic::x86_sse2_packuswb_128:
1401   case Intrinsic::x86_sse41_packusdw:
1402   case Intrinsic::x86_avx2_packusdw:
1403   case Intrinsic::x86_avx2_packuswb:
1404   case Intrinsic::x86_avx512_packusdw_512:
1405   case Intrinsic::x86_avx512_packuswb_512:
1406     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
1407       return IC.replaceInstUsesWith(II, V);
1408     }
1409     break;
1410 
1411   case Intrinsic::x86_pclmulqdq:
1412   case Intrinsic::x86_pclmulqdq_256:
1413   case Intrinsic::x86_pclmulqdq_512: {
1414     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1415       unsigned Imm = C->getZExtValue();
1416 
1417       bool MadeChange = false;
1418       Value *Arg0 = II.getArgOperand(0);
1419       Value *Arg1 = II.getArgOperand(1);
1420       unsigned VWidth =
1421           cast<FixedVectorType>(Arg0->getType())->getNumElements();
1422 
1423       APInt UndefElts1(VWidth, 0);
1424       APInt DemandedElts1 =
1425           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
1426       if (Value *V =
1427               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
1428         IC.replaceOperand(II, 0, V);
1429         MadeChange = true;
1430       }
1431 
1432       APInt UndefElts2(VWidth, 0);
1433       APInt DemandedElts2 =
1434           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
1435       if (Value *V =
1436               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
1437         IC.replaceOperand(II, 1, V);
1438         MadeChange = true;
1439       }
1440 
1441       // If either input elements are undef, the result is zero.
1442       if (DemandedElts1.isSubsetOf(UndefElts1) ||
1443           DemandedElts2.isSubsetOf(UndefElts2)) {
1444         return IC.replaceInstUsesWith(II,
1445                                       ConstantAggregateZero::get(II.getType()));
1446       }
1447 
1448       if (MadeChange) {
1449         return &II;
1450       }
1451     }
1452     break;
1453   }
1454 
1455   case Intrinsic::x86_sse41_insertps:
1456     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1457       return IC.replaceInstUsesWith(II, V);
1458     }
1459     break;
1460 
1461   case Intrinsic::x86_sse4a_extrq: {
1462     Value *Op0 = II.getArgOperand(0);
1463     Value *Op1 = II.getArgOperand(1);
1464     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1465     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1466     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1467            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1468            VWidth1 == 16 && "Unexpected operand sizes");
1469 
1470     // See if we're dealing with constant values.
1471     auto *C1 = dyn_cast<Constant>(Op1);
1472     auto *CILength =
1473         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1474            : nullptr;
1475     auto *CIIndex =
1476         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1477            : nullptr;
1478 
1479     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1480     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1481       return IC.replaceInstUsesWith(II, V);
1482     }
1483 
1484     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
1485     // operands and the lowest 16-bits of the second.
1486     bool MadeChange = false;
1487     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1488       IC.replaceOperand(II, 0, V);
1489       MadeChange = true;
1490     }
1491     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1492       IC.replaceOperand(II, 1, V);
1493       MadeChange = true;
1494     }
1495     if (MadeChange) {
1496       return &II;
1497     }
1498     break;
1499   }
1500 
1501   case Intrinsic::x86_sse4a_extrqi: {
1502     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
1503     // bits of the lower 64-bits. The upper 64-bits are undefined.
1504     Value *Op0 = II.getArgOperand(0);
1505     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1506     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1507            "Unexpected operand size");
1508 
1509     // See if we're dealing with constant values.
1510     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1511     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1512 
1513     // Attempt to simplify to a constant or shuffle vector.
1514     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1515       return IC.replaceInstUsesWith(II, V);
1516     }
1517 
1518     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
1519     // operand.
1520     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1521       return IC.replaceOperand(II, 0, V);
1522     }
1523     break;
1524   }
1525 
1526   case Intrinsic::x86_sse4a_insertq: {
1527     Value *Op0 = II.getArgOperand(0);
1528     Value *Op1 = II.getArgOperand(1);
1529     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1530     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1531            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1532            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1533            "Unexpected operand size");
1534 
1535     // See if we're dealing with constant values.
1536     auto *C1 = dyn_cast<Constant>(Op1);
1537     auto *CI11 =
1538         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1539            : nullptr;
1540 
1541     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1542     if (CI11) {
1543       const APInt &V11 = CI11->getValue();
1544       APInt Len = V11.zextOrTrunc(6);
1545       APInt Idx = V11.lshr(8).zextOrTrunc(6);
1546       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1547         return IC.replaceInstUsesWith(II, V);
1548       }
1549     }
1550 
1551     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
1552     // operand.
1553     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1554       return IC.replaceOperand(II, 0, V);
1555     }
1556     break;
1557   }
1558 
1559   case Intrinsic::x86_sse4a_insertqi: {
1560     // INSERTQI: Extract lowest Length bits from lower half of second source and
1561     // insert over first source starting at Index bit. The upper 64-bits are
1562     // undefined.
1563     Value *Op0 = II.getArgOperand(0);
1564     Value *Op1 = II.getArgOperand(1);
1565     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1566     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1567     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1568            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1569            VWidth1 == 2 && "Unexpected operand sizes");
1570 
1571     // See if we're dealing with constant values.
1572     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1573     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1574 
1575     // Attempt to simplify to a constant or shuffle vector.
1576     if (CILength && CIIndex) {
1577       APInt Len = CILength->getValue().zextOrTrunc(6);
1578       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1579       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1580         return IC.replaceInstUsesWith(II, V);
1581       }
1582     }
1583 
1584     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
1585     // operands.
1586     bool MadeChange = false;
1587     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1588       IC.replaceOperand(II, 0, V);
1589       MadeChange = true;
1590     }
1591     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1592       IC.replaceOperand(II, 1, V);
1593       MadeChange = true;
1594     }
1595     if (MadeChange) {
1596       return &II;
1597     }
1598     break;
1599   }
1600 
1601   case Intrinsic::x86_sse41_pblendvb:
1602   case Intrinsic::x86_sse41_blendvps:
1603   case Intrinsic::x86_sse41_blendvpd:
1604   case Intrinsic::x86_avx_blendv_ps_256:
1605   case Intrinsic::x86_avx_blendv_pd_256:
1606   case Intrinsic::x86_avx2_pblendvb: {
1607     // fold (blend A, A, Mask) -> A
1608     Value *Op0 = II.getArgOperand(0);
1609     Value *Op1 = II.getArgOperand(1);
1610     Value *Mask = II.getArgOperand(2);
1611     if (Op0 == Op1) {
1612       return IC.replaceInstUsesWith(II, Op0);
1613     }
1614 
1615     // Zero Mask - select 1st argument.
1616     if (isa<ConstantAggregateZero>(Mask)) {
1617       return IC.replaceInstUsesWith(II, Op0);
1618     }
1619 
1620     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
1621     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1622       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1623       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1624     }
1625 
1626     // Convert to a vector select if we can bypass casts and find a boolean
1627     // vector condition value.
1628     Value *BoolVec;
1629     Mask = InstCombiner::peekThroughBitcast(Mask);
1630     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
1631         BoolVec->getType()->isVectorTy() &&
1632         BoolVec->getType()->getScalarSizeInBits() == 1) {
1633       assert(Mask->getType()->getPrimitiveSizeInBits() ==
1634                  II.getType()->getPrimitiveSizeInBits() &&
1635              "Not expecting mask and operands with different sizes");
1636 
1637       unsigned NumMaskElts =
1638           cast<FixedVectorType>(Mask->getType())->getNumElements();
1639       unsigned NumOperandElts =
1640           cast<FixedVectorType>(II.getType())->getNumElements();
1641       if (NumMaskElts == NumOperandElts) {
1642         return SelectInst::Create(BoolVec, Op1, Op0);
1643       }
1644 
1645       // If the mask has less elements than the operands, each mask bit maps to
1646       // multiple elements of the operands. Bitcast back and forth.
1647       if (NumMaskElts < NumOperandElts) {
1648         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1649         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1650         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1651         return new BitCastInst(Sel, II.getType());
1652       }
1653     }
1654 
1655     break;
1656   }
1657 
1658   case Intrinsic::x86_ssse3_pshuf_b_128:
1659   case Intrinsic::x86_avx2_pshuf_b:
1660   case Intrinsic::x86_avx512_pshuf_b_512:
1661     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1662       return IC.replaceInstUsesWith(II, V);
1663     }
1664     break;
1665 
1666   case Intrinsic::x86_avx_vpermilvar_ps:
1667   case Intrinsic::x86_avx_vpermilvar_ps_256:
1668   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1669   case Intrinsic::x86_avx_vpermilvar_pd:
1670   case Intrinsic::x86_avx_vpermilvar_pd_256:
1671   case Intrinsic::x86_avx512_vpermilvar_pd_512:
1672     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1673       return IC.replaceInstUsesWith(II, V);
1674     }
1675     break;
1676 
1677   case Intrinsic::x86_avx2_permd:
1678   case Intrinsic::x86_avx2_permps:
1679   case Intrinsic::x86_avx512_permvar_df_256:
1680   case Intrinsic::x86_avx512_permvar_df_512:
1681   case Intrinsic::x86_avx512_permvar_di_256:
1682   case Intrinsic::x86_avx512_permvar_di_512:
1683   case Intrinsic::x86_avx512_permvar_hi_128:
1684   case Intrinsic::x86_avx512_permvar_hi_256:
1685   case Intrinsic::x86_avx512_permvar_hi_512:
1686   case Intrinsic::x86_avx512_permvar_qi_128:
1687   case Intrinsic::x86_avx512_permvar_qi_256:
1688   case Intrinsic::x86_avx512_permvar_qi_512:
1689   case Intrinsic::x86_avx512_permvar_sf_512:
1690   case Intrinsic::x86_avx512_permvar_si_512:
1691     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1692       return IC.replaceInstUsesWith(II, V);
1693     }
1694     break;
1695 
1696   case Intrinsic::x86_avx_maskload_ps:
1697   case Intrinsic::x86_avx_maskload_pd:
1698   case Intrinsic::x86_avx_maskload_ps_256:
1699   case Intrinsic::x86_avx_maskload_pd_256:
1700   case Intrinsic::x86_avx2_maskload_d:
1701   case Intrinsic::x86_avx2_maskload_q:
1702   case Intrinsic::x86_avx2_maskload_d_256:
1703   case Intrinsic::x86_avx2_maskload_q_256:
1704     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1705       return I;
1706     }
1707     break;
1708 
1709   case Intrinsic::x86_sse2_maskmov_dqu:
1710   case Intrinsic::x86_avx_maskstore_ps:
1711   case Intrinsic::x86_avx_maskstore_pd:
1712   case Intrinsic::x86_avx_maskstore_ps_256:
1713   case Intrinsic::x86_avx_maskstore_pd_256:
1714   case Intrinsic::x86_avx2_maskstore_d:
1715   case Intrinsic::x86_avx2_maskstore_q:
1716   case Intrinsic::x86_avx2_maskstore_d_256:
1717   case Intrinsic::x86_avx2_maskstore_q_256:
1718     if (simplifyX86MaskedStore(II, IC)) {
1719       return nullptr;
1720     }
1721     break;
1722 
1723   case Intrinsic::x86_addcarry_32:
1724   case Intrinsic::x86_addcarry_64:
1725     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1726       return IC.replaceInstUsesWith(II, V);
1727     }
1728     break;
1729 
1730   default:
1731     break;
1732   }
1733   return None;
1734 }
1735 
1736 Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1737     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1738     bool &KnownBitsComputed) const {
1739   switch (II.getIntrinsicID()) {
1740   default:
1741     break;
1742   case Intrinsic::x86_mmx_pmovmskb:
1743   case Intrinsic::x86_sse_movmsk_ps:
1744   case Intrinsic::x86_sse2_movmsk_pd:
1745   case Intrinsic::x86_sse2_pmovmskb_128:
1746   case Intrinsic::x86_avx_movmsk_ps_256:
1747   case Intrinsic::x86_avx_movmsk_pd_256:
1748   case Intrinsic::x86_avx2_pmovmskb: {
1749     // MOVMSK copies the vector elements' sign bits to the low bits
1750     // and zeros the high bits.
1751     unsigned ArgWidth;
1752     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1753       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1754     } else {
1755       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
1756       ArgWidth = ArgType->getNumElements();
1757     }
1758 
1759     // If we don't need any of low bits then return zero,
1760     // we know that DemandedMask is non-zero already.
1761     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1762     Type *VTy = II.getType();
1763     if (DemandedElts.isZero()) {
1764       return ConstantInt::getNullValue(VTy);
1765     }
1766 
1767     // We know that the upper bits are set to zero.
1768     Known.Zero.setBitsFrom(ArgWidth);
1769     KnownBitsComputed = true;
1770     break;
1771   }
1772   }
1773   return None;
1774 }
1775 
/// X86 hook for demanded-vector-elements simplification of an intrinsic call.
///
/// Dispatches on the intrinsic ID and, for the recognised X86 intrinsics,
/// narrows the elements demanded from each operand (through
/// \p simplifyAndSetOp) and records which result elements are undefined.
///
/// \param IC               InstCombiner; used here to re-queue \p II and, for
///                         ADDSUB, to build the replacement FAdd/FSub.
/// \param II               The intrinsic call. Its result type is cast to
///                         FixedVectorType to obtain the element count.
/// \param DemandedElts     One bit per result element; a set bit means the
///                         element is used. Passed by value and overwritten
///                         locally in several cases.
/// \param UndefElts        In/out mask of undefined result elements.
///                         NOTE(review): the PACK and SSE4A cases OR into /
///                         set bits of this mask without clearing it first, so
///                         the caller presumably passes it zero-initialised -
///                         confirm against the caller.
/// \param UndefElts2       Receives the undef mask reported for operand 1.
/// \param UndefElts3       Receives the undef mask reported for operand 2.
/// \param simplifyAndSetOp Callback invoked as (Inst, OpNum, Demanded, Undef).
///                         Presumably simplifies operand OpNum of Inst for the
///                         given demanded elements and writes that operand's
///                         undef elements to Undef - confirm against caller.
/// \returns a value to use in place of \p II (a zero vector, operand 0, or a
///          new FAdd/FSub), or None when no replacement value is produced.
1776 Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1777     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1778     APInt &UndefElts2, APInt &UndefElts3,
1779     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1780         simplifyAndSetOp) const {
1781   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); // result element count
1782   switch (II.getIntrinsicID()) {
1783   default:
1784     break;
1785   case Intrinsic::x86_xop_vfrcz_ss:
1786   case Intrinsic::x86_xop_vfrcz_sd:
1787     // The instructions for these intrinsics are specified to zero the upper
1788     // elements rather than pass them through like other scalar intrinsics, so
1789     // we can't just use Arg0 when DemandedElts[0] is clear as we do elsewhere.
1790     // Instead we return a zero vector.
1791     if (!DemandedElts[0]) {
1792       IC.addToWorklist(&II);
1793       return ConstantAggregateZero::get(II.getType());
1794     }
1795 
1796     // Only the lower element of the operand is used.
1797     DemandedElts = 1;
1798     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1799 
1800     // Only the lower element can be undefined. The high elements are zero.
1801     UndefElts = UndefElts[0]; // keep bit 0 only; all higher bits become 0
1802     break;
1803 
1804   // Unary scalar-as-vector operations that work column-wise.
1805   case Intrinsic::x86_sse_rcp_ss:
1806   case Intrinsic::x86_sse_rsqrt_ss:
1807     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1808 
1809     // If the lowest element of a scalar op isn't used then use Arg0.
1810     if (!DemandedElts[0]) {
1811       IC.addToWorklist(&II);
1812       return II.getArgOperand(0);
1813     }
1814     // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
1815     // checks).
1816     break;
1817 
1818   // Binary scalar-as-vector operations that work column-wise. The high
1819   // elements come from operand 0. The low element is a function of both
1820   // operands.
1821   case Intrinsic::x86_sse_min_ss:
1822   case Intrinsic::x86_sse_max_ss:
1823   case Intrinsic::x86_sse_cmp_ss:
1824   case Intrinsic::x86_sse2_min_sd:
1825   case Intrinsic::x86_sse2_max_sd:
1826   case Intrinsic::x86_sse2_cmp_sd: {
1827     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1828 
1829     // If the lowest element of a scalar op isn't used then use Arg0.
1830     if (!DemandedElts[0]) {
1831       IC.addToWorklist(&II);
1832       return II.getArgOperand(0);
1833     }
1834 
1835     // Only the lower element is used for operand 1.
1836     DemandedElts = 1;
1837     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1838 
1839     // The lower element is undefined only if both lower elements are undefined.
1840     // Consider things like undef&0.  The result is known zero, not undef.
1841     if (!UndefElts2[0])
1842       UndefElts.clearBit(0);
1843 
1844     break;
1845   }
1846 
1847   // Binary scalar-as-vector operations that work column-wise. The high
1848   // elements come from operand 0 and the low element comes from operand 1.
1849   case Intrinsic::x86_sse41_round_ss:
1850   case Intrinsic::x86_sse41_round_sd: {
1851     // Don't use the low element of operand 0.
1852     APInt DemandedElts2 = DemandedElts;
1853     DemandedElts2.clearBit(0);
1854     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1855 
1856     // If the lowest element of a scalar op isn't used then use Arg0.
1857     if (!DemandedElts[0]) {
1858       IC.addToWorklist(&II);
1859       return II.getArgOperand(0);
1860     }
1861 
1862     // Only the lower element is used for operand 1.
1863     DemandedElts = 1;
1864     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1865 
1866     // Take the high undef elements from operand 0 and take the lower element
1867     // from operand 1.
1868     UndefElts.clearBit(0);
1869     UndefElts |= UndefElts2[0];
1870     break;
1871   }
1872 
1873   // Three input scalar-as-vector operations that work column-wise. The high
1874   // elements come from operand 0 and the low element is a function of all
1875   // three inputs.
1876   case Intrinsic::x86_avx512_mask_add_ss_round:
1877   case Intrinsic::x86_avx512_mask_div_ss_round:
1878   case Intrinsic::x86_avx512_mask_mul_ss_round:
1879   case Intrinsic::x86_avx512_mask_sub_ss_round:
1880   case Intrinsic::x86_avx512_mask_max_ss_round:
1881   case Intrinsic::x86_avx512_mask_min_ss_round:
1882   case Intrinsic::x86_avx512_mask_add_sd_round:
1883   case Intrinsic::x86_avx512_mask_div_sd_round:
1884   case Intrinsic::x86_avx512_mask_mul_sd_round:
1885   case Intrinsic::x86_avx512_mask_sub_sd_round:
1886   case Intrinsic::x86_avx512_mask_max_sd_round:
1887   case Intrinsic::x86_avx512_mask_min_sd_round:
1888     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1889 
1890     // If the lowest element of a scalar op isn't used then use Arg0.
1891     if (!DemandedElts[0]) {
1892       IC.addToWorklist(&II);
1893       return II.getArgOperand(0);
1894     }
1895 
1896     // Only the lower element is used for operands 1 and 2.
1897     DemandedElts = 1;
1898     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1899     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1900 
1901     // The lower element is undefined only if all three lower elements are
1902     // undefined. Consider things like undef&0: the result is zero, not undef.
1903     if (!UndefElts2[0] || !UndefElts3[0])
1904       UndefElts.clearBit(0);
1905     break;
1906 
1907   // TODO: Add fmaddsub support?
1908   case Intrinsic::x86_sse3_addsub_pd:
1909   case Intrinsic::x86_sse3_addsub_ps:
1910   case Intrinsic::x86_avx_addsub_pd_256:
1911   case Intrinsic::x86_avx_addsub_ps_256: {
1912     // If none of the even or none of the odd lanes are required, turn this
1913     // into a generic FP math instruction.
1914     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); // even lanes (bits 0, 2, ...)
1915     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); // odd lanes (bits 1, 3, ...)
1916     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
1917     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
1918     if (IsSubOnly || IsAddOnly) {
1919       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
1920       IRBuilderBase::InsertPointGuard Guard(IC.Builder); // restores the insert point on scope exit
1921       IC.Builder.SetInsertPoint(&II);
1922       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
1923       return IC.Builder.CreateBinOp(
1924           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
1925     }
1926 
1927     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1928     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1929     UndefElts &= UndefElts2; // undef only where both operands are undef
1930     break;
1931   }
1932 
1933   // General per-element vector operations.
1934   case Intrinsic::x86_avx2_psllv_d:
1935   case Intrinsic::x86_avx2_psllv_d_256:
1936   case Intrinsic::x86_avx2_psllv_q:
1937   case Intrinsic::x86_avx2_psllv_q_256:
1938   case Intrinsic::x86_avx2_psrlv_d:
1939   case Intrinsic::x86_avx2_psrlv_d_256:
1940   case Intrinsic::x86_avx2_psrlv_q:
1941   case Intrinsic::x86_avx2_psrlv_q_256:
1942   case Intrinsic::x86_avx2_psrav_d:
1943   case Intrinsic::x86_avx2_psrav_d_256: {
1944     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1945     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1946     UndefElts &= UndefElts2; // undef only where both operands are undef
1947     break;
1948   }
1949 
1950   case Intrinsic::x86_sse2_packssdw_128:
1951   case Intrinsic::x86_sse2_packsswb_128:
1952   case Intrinsic::x86_sse2_packuswb_128:
1953   case Intrinsic::x86_sse41_packusdw:
1954   case Intrinsic::x86_avx2_packssdw:
1955   case Intrinsic::x86_avx2_packsswb:
1956   case Intrinsic::x86_avx2_packusdw:
1957   case Intrinsic::x86_avx2_packuswb:
1958   case Intrinsic::x86_avx512_packssdw_512:
1959   case Intrinsic::x86_avx512_packsswb_512:
1960   case Intrinsic::x86_avx512_packusdw_512:
1961   case Intrinsic::x86_avx512_packuswb_512: {
1962     auto *Ty0 = II.getArgOperand(0)->getType();
1963     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1964     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1965 
1966     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; // 128-bit lanes per operand
1967     unsigned VWidthPerLane = VWidth / NumLanes;
1968     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
1969 
1970     // Per lane, pack the elements of the first input and then the second.
1971     // e.g.
1972     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1973     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1974     for (int OpNum = 0; OpNum != 2; ++OpNum) {
1975       APInt OpDemandedElts(InnerVWidth, 0);
1976       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1977         unsigned LaneIdx = Lane * VWidthPerLane; // first result element of this lane
1978         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1979           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; // result slot fed by this operand element
1980           if (DemandedElts[Idx])
1981             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1982         }
1983       }
1984 
1985       // Demand only the mapped elements from the operand.
1986       APInt OpUndefElts(InnerVWidth, 0);
1987       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1988 
1989       // Pack the operand's UNDEF elements into result positions, one lane at a time.
1990       OpUndefElts = OpUndefElts.zext(VWidth);
1991       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1992         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1993         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1994         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1995         UndefElts |= LaneElts;
1996       }
1997     }
1998     break;
1999   }
2000 
2001   // PSHUFB
2002   case Intrinsic::x86_ssse3_pshuf_b_128:
2003   case Intrinsic::x86_avx2_pshuf_b:
2004   case Intrinsic::x86_avx512_pshuf_b_512:
2005   // PERMILVAR
2006   case Intrinsic::x86_avx_vpermilvar_ps:
2007   case Intrinsic::x86_avx_vpermilvar_ps_256:
2008   case Intrinsic::x86_avx512_vpermilvar_ps_512:
2009   case Intrinsic::x86_avx_vpermilvar_pd:
2010   case Intrinsic::x86_avx_vpermilvar_pd_256:
2011   case Intrinsic::x86_avx512_vpermilvar_pd_512:
2012   // PERMV
2013   case Intrinsic::x86_avx2_permd:
2014   case Intrinsic::x86_avx2_permps: {
2015     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); // only operand 1 is narrowed here
2016     break;
2017   }
2018 
2019   // SSE4A instructions leave the upper 64-bits of the 128-bit result
2020   // in an undefined state.
2021   case Intrinsic::x86_sse4a_extrq:
2022   case Intrinsic::x86_sse4a_extrqi:
2023   case Intrinsic::x86_sse4a_insertq:
2024   case Intrinsic::x86_sse4a_insertqi:
2025     UndefElts.setHighBits(VWidth / 2); // mark the upper half of the elements undef
2026     break;
2027   }
2028   return None; // no replacement value for II
2029 }
2030