xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp (revision 63f537551380d2dab29fa402ad1269feae17e594)
1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "x86tti"
26 
27 /// Return a constant boolean vector that has true elements in all positions
28 /// where the input constant data vector has an element with the sign bit set.
29 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
30   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31   V = ConstantExpr::getBitCast(V, IntTy);
32   V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
33                             V);
34   return V;
35 }
36 
37 /// Convert the x86 XMM integer vector mask to a vector of bools based on
38 /// each element's most significant bit (the sign bit).
39 static Value *getBoolVecFromMask(Value *Mask) {
40   // Fold Constant Mask.
41   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
42     return getNegativeIsTrueBoolVec(ConstantMask);
43 
44   // Mask was extended from a boolean vector.
45   Value *ExtMask;
46   if (PatternMatch::match(
47           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
48       ExtMask->getType()->isIntOrIntVectorTy(1))
49     return ExtMask;
50 
51   return nullptr;
52 }
53 
54 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
55 // XMM register mask efficiently, we could transform all x86 masked intrinsics
56 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
57 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
58   Value *Ptr = II.getOperand(0);
59   Value *Mask = II.getOperand(1);
60   Constant *ZeroVec = Constant::getNullValue(II.getType());
61 
62   // Zero Mask - masked load instruction creates a zero vector.
63   if (isa<ConstantAggregateZero>(Mask))
64     return IC.replaceInstUsesWith(II, ZeroVec);
65 
66   // The mask is constant or extended from a bool vector. Convert this x86
67   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
68   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
69     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
70     // the LLVM intrinsic definition for the pointer argument.
71     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
72     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
73     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
74 
75     // The pass-through vector for an x86 masked load is a zero vector.
76     CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
77         II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
78     return IC.replaceInstUsesWith(II, NewMaskedLoad);
79   }
80 
81   return nullptr;
82 }
83 
84 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
85 // XMM register mask efficiently, we could transform all x86 masked intrinsics
86 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
87 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
88   Value *Ptr = II.getOperand(0);
89   Value *Mask = II.getOperand(1);
90   Value *Vec = II.getOperand(2);
91 
92   // Zero Mask - this masked store instruction does nothing.
93   if (isa<ConstantAggregateZero>(Mask)) {
94     IC.eraseInstFromFunction(II);
95     return true;
96   }
97 
98   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
99   // anything else at this level.
100   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
101     return false;
102 
103   // The mask is constant or extended from a bool vector. Convert this x86
104   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
105   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
106     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
107     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
108     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
109 
110     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
111 
112     // 'Replace uses' doesn't work for stores. Erase the original masked store.
113     IC.eraseInstFromFunction(II);
114     return true;
115   }
116 
117   return false;
118 }
119 
120 static Value *simplifyX86immShift(const IntrinsicInst &II,
121                                   InstCombiner::BuilderTy &Builder) {
122   bool LogicalShift = false;
123   bool ShiftLeft = false;
124   bool IsImm = false;
125 
126   switch (II.getIntrinsicID()) {
127   default:
128     llvm_unreachable("Unexpected intrinsic!");
129   case Intrinsic::x86_sse2_psrai_d:
130   case Intrinsic::x86_sse2_psrai_w:
131   case Intrinsic::x86_avx2_psrai_d:
132   case Intrinsic::x86_avx2_psrai_w:
133   case Intrinsic::x86_avx512_psrai_q_128:
134   case Intrinsic::x86_avx512_psrai_q_256:
135   case Intrinsic::x86_avx512_psrai_d_512:
136   case Intrinsic::x86_avx512_psrai_q_512:
137   case Intrinsic::x86_avx512_psrai_w_512:
138     IsImm = true;
139     [[fallthrough]];
140   case Intrinsic::x86_sse2_psra_d:
141   case Intrinsic::x86_sse2_psra_w:
142   case Intrinsic::x86_avx2_psra_d:
143   case Intrinsic::x86_avx2_psra_w:
144   case Intrinsic::x86_avx512_psra_q_128:
145   case Intrinsic::x86_avx512_psra_q_256:
146   case Intrinsic::x86_avx512_psra_d_512:
147   case Intrinsic::x86_avx512_psra_q_512:
148   case Intrinsic::x86_avx512_psra_w_512:
149     LogicalShift = false;
150     ShiftLeft = false;
151     break;
152   case Intrinsic::x86_sse2_psrli_d:
153   case Intrinsic::x86_sse2_psrli_q:
154   case Intrinsic::x86_sse2_psrli_w:
155   case Intrinsic::x86_avx2_psrli_d:
156   case Intrinsic::x86_avx2_psrli_q:
157   case Intrinsic::x86_avx2_psrli_w:
158   case Intrinsic::x86_avx512_psrli_d_512:
159   case Intrinsic::x86_avx512_psrli_q_512:
160   case Intrinsic::x86_avx512_psrli_w_512:
161     IsImm = true;
162     [[fallthrough]];
163   case Intrinsic::x86_sse2_psrl_d:
164   case Intrinsic::x86_sse2_psrl_q:
165   case Intrinsic::x86_sse2_psrl_w:
166   case Intrinsic::x86_avx2_psrl_d:
167   case Intrinsic::x86_avx2_psrl_q:
168   case Intrinsic::x86_avx2_psrl_w:
169   case Intrinsic::x86_avx512_psrl_d_512:
170   case Intrinsic::x86_avx512_psrl_q_512:
171   case Intrinsic::x86_avx512_psrl_w_512:
172     LogicalShift = true;
173     ShiftLeft = false;
174     break;
175   case Intrinsic::x86_sse2_pslli_d:
176   case Intrinsic::x86_sse2_pslli_q:
177   case Intrinsic::x86_sse2_pslli_w:
178   case Intrinsic::x86_avx2_pslli_d:
179   case Intrinsic::x86_avx2_pslli_q:
180   case Intrinsic::x86_avx2_pslli_w:
181   case Intrinsic::x86_avx512_pslli_d_512:
182   case Intrinsic::x86_avx512_pslli_q_512:
183   case Intrinsic::x86_avx512_pslli_w_512:
184     IsImm = true;
185     [[fallthrough]];
186   case Intrinsic::x86_sse2_psll_d:
187   case Intrinsic::x86_sse2_psll_q:
188   case Intrinsic::x86_sse2_psll_w:
189   case Intrinsic::x86_avx2_psll_d:
190   case Intrinsic::x86_avx2_psll_q:
191   case Intrinsic::x86_avx2_psll_w:
192   case Intrinsic::x86_avx512_psll_d_512:
193   case Intrinsic::x86_avx512_psll_q_512:
194   case Intrinsic::x86_avx512_psll_w_512:
195     LogicalShift = true;
196     ShiftLeft = true;
197     break;
198   }
199   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
200 
201   Value *Vec = II.getArgOperand(0);
202   Value *Amt = II.getArgOperand(1);
203   auto *VT = cast<FixedVectorType>(Vec->getType());
204   Type *SVT = VT->getElementType();
205   Type *AmtVT = Amt->getType();
206   unsigned VWidth = VT->getNumElements();
207   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
208 
209   // If the shift amount is guaranteed to be in-range we can replace it with a
210   // generic shift. If its guaranteed to be out of range, logical shifts combine
211   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
212   if (IsImm) {
213     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
214     KnownBits KnownAmtBits =
215         llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
216     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
217       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
218       Amt = Builder.CreateVectorSplat(VWidth, Amt);
219       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
220                                         : Builder.CreateLShr(Vec, Amt))
221                            : Builder.CreateAShr(Vec, Amt));
222     }
223     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
224       if (LogicalShift)
225         return ConstantAggregateZero::get(VT);
226       Amt = ConstantInt::get(SVT, BitWidth - 1);
227       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
228     }
229   } else {
230     // Ensure the first element has an in-range value and the rest of the
231     // elements in the bottom 64 bits are zero.
232     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
233            cast<VectorType>(AmtVT)->getElementType() == SVT &&
234            "Unexpected shift-by-scalar type");
235     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
236     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
237     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
238     KnownBits KnownLowerBits = llvm::computeKnownBits(
239         Amt, DemandedLower, II.getModule()->getDataLayout());
240     KnownBits KnownUpperBits = llvm::computeKnownBits(
241         Amt, DemandedUpper, II.getModule()->getDataLayout());
242     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
243         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
244       SmallVector<int, 16> ZeroSplat(VWidth, 0);
245       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
246       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
247                                         : Builder.CreateLShr(Vec, Amt))
248                            : Builder.CreateAShr(Vec, Amt));
249     }
250   }
251 
252   // Simplify if count is constant vector.
253   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
254   if (!CDV)
255     return nullptr;
256 
257   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
258   // operand to compute the shift amount.
259   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
260          cast<VectorType>(AmtVT)->getElementType() == SVT &&
261          "Unexpected shift-by-scalar type");
262 
263   // Concatenate the sub-elements to create the 64-bit value.
264   APInt Count(64, 0);
265   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
266     unsigned SubEltIdx = (NumSubElts - 1) - i;
267     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
268     Count <<= BitWidth;
269     Count |= SubElt->getValue().zextOrTrunc(64);
270   }
271 
272   // If shift-by-zero then just return the original value.
273   if (Count.isZero())
274     return Vec;
275 
276   // Handle cases when Shift >= BitWidth.
277   if (Count.uge(BitWidth)) {
278     // If LogicalShift - just return zero.
279     if (LogicalShift)
280       return ConstantAggregateZero::get(VT);
281 
282     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
283     Count = APInt(64, BitWidth - 1);
284   }
285 
286   // Get a constant vector of the same type as the first operand.
287   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
288   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
289 
290   if (ShiftLeft)
291     return Builder.CreateShl(Vec, ShiftVec);
292 
293   if (LogicalShift)
294     return Builder.CreateLShr(Vec, ShiftVec);
295 
296   return Builder.CreateAShr(Vec, ShiftVec);
297 }
298 
299 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
300 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
301 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
302 static Value *simplifyX86varShift(const IntrinsicInst &II,
303                                   InstCombiner::BuilderTy &Builder) {
304   bool LogicalShift = false;
305   bool ShiftLeft = false;
306 
307   switch (II.getIntrinsicID()) {
308   default:
309     llvm_unreachable("Unexpected intrinsic!");
310   case Intrinsic::x86_avx2_psrav_d:
311   case Intrinsic::x86_avx2_psrav_d_256:
312   case Intrinsic::x86_avx512_psrav_q_128:
313   case Intrinsic::x86_avx512_psrav_q_256:
314   case Intrinsic::x86_avx512_psrav_d_512:
315   case Intrinsic::x86_avx512_psrav_q_512:
316   case Intrinsic::x86_avx512_psrav_w_128:
317   case Intrinsic::x86_avx512_psrav_w_256:
318   case Intrinsic::x86_avx512_psrav_w_512:
319     LogicalShift = false;
320     ShiftLeft = false;
321     break;
322   case Intrinsic::x86_avx2_psrlv_d:
323   case Intrinsic::x86_avx2_psrlv_d_256:
324   case Intrinsic::x86_avx2_psrlv_q:
325   case Intrinsic::x86_avx2_psrlv_q_256:
326   case Intrinsic::x86_avx512_psrlv_d_512:
327   case Intrinsic::x86_avx512_psrlv_q_512:
328   case Intrinsic::x86_avx512_psrlv_w_128:
329   case Intrinsic::x86_avx512_psrlv_w_256:
330   case Intrinsic::x86_avx512_psrlv_w_512:
331     LogicalShift = true;
332     ShiftLeft = false;
333     break;
334   case Intrinsic::x86_avx2_psllv_d:
335   case Intrinsic::x86_avx2_psllv_d_256:
336   case Intrinsic::x86_avx2_psllv_q:
337   case Intrinsic::x86_avx2_psllv_q_256:
338   case Intrinsic::x86_avx512_psllv_d_512:
339   case Intrinsic::x86_avx512_psllv_q_512:
340   case Intrinsic::x86_avx512_psllv_w_128:
341   case Intrinsic::x86_avx512_psllv_w_256:
342   case Intrinsic::x86_avx512_psllv_w_512:
343     LogicalShift = true;
344     ShiftLeft = true;
345     break;
346   }
347   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
348 
349   Value *Vec = II.getArgOperand(0);
350   Value *Amt = II.getArgOperand(1);
351   auto *VT = cast<FixedVectorType>(II.getType());
352   Type *SVT = VT->getElementType();
353   int NumElts = VT->getNumElements();
354   int BitWidth = SVT->getIntegerBitWidth();
355 
356   // If the shift amount is guaranteed to be in-range we can replace it with a
357   // generic shift.
358   KnownBits KnownAmt =
359       llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
360   if (KnownAmt.getMaxValue().ult(BitWidth)) {
361     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
362                                       : Builder.CreateLShr(Vec, Amt))
363                          : Builder.CreateAShr(Vec, Amt));
364   }
365 
366   // Simplify if all shift amounts are constant/undef.
367   auto *CShift = dyn_cast<Constant>(Amt);
368   if (!CShift)
369     return nullptr;
370 
371   // Collect each element's shift amount.
372   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
373   bool AnyOutOfRange = false;
374   SmallVector<int, 8> ShiftAmts;
375   for (int I = 0; I < NumElts; ++I) {
376     auto *CElt = CShift->getAggregateElement(I);
377     if (isa_and_nonnull<UndefValue>(CElt)) {
378       ShiftAmts.push_back(-1);
379       continue;
380     }
381 
382     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
383     if (!COp)
384       return nullptr;
385 
386     // Handle out of range shifts.
387     // If LogicalShift - set to BitWidth (special case).
388     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
389     APInt ShiftVal = COp->getValue();
390     if (ShiftVal.uge(BitWidth)) {
391       AnyOutOfRange = LogicalShift;
392       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
393       continue;
394     }
395 
396     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
397   }
398 
399   // If all elements out of range or UNDEF, return vector of zeros/undefs.
400   // ArithmeticShift should only hit this if they are all UNDEF.
401   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
402   if (llvm::all_of(ShiftAmts, OutOfRange)) {
403     SmallVector<Constant *, 8> ConstantVec;
404     for (int Idx : ShiftAmts) {
405       if (Idx < 0) {
406         ConstantVec.push_back(UndefValue::get(SVT));
407       } else {
408         assert(LogicalShift && "Logical shift expected");
409         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
410       }
411     }
412     return ConstantVector::get(ConstantVec);
413   }
414 
415   // We can't handle only some out of range values with generic logical shifts.
416   if (AnyOutOfRange)
417     return nullptr;
418 
419   // Build the shift amount constant vector.
420   SmallVector<Constant *, 8> ShiftVecAmts;
421   for (int Idx : ShiftAmts) {
422     if (Idx < 0)
423       ShiftVecAmts.push_back(UndefValue::get(SVT));
424     else
425       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
426   }
427   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
428 
429   if (ShiftLeft)
430     return Builder.CreateShl(Vec, ShiftVec);
431 
432   if (LogicalShift)
433     return Builder.CreateLShr(Vec, ShiftVec);
434 
435   return Builder.CreateAShr(Vec, ShiftVec);
436 }
437 
438 static Value *simplifyX86pack(IntrinsicInst &II,
439                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
440   Value *Arg0 = II.getArgOperand(0);
441   Value *Arg1 = II.getArgOperand(1);
442   Type *ResTy = II.getType();
443 
444   // Fast all undef handling.
445   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
446     return UndefValue::get(ResTy);
447 
448   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
449   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
450   unsigned NumSrcElts = ArgTy->getNumElements();
451   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
452          "Unexpected packing types");
453 
454   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
455   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
456   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
457   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
458          "Unexpected packing types");
459 
460   // Constant folding.
461   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
462     return nullptr;
463 
464   // Clamp Values - signed/unsigned both use signed clamp values, but they
465   // differ on the min/max values.
466   APInt MinValue, MaxValue;
467   if (IsSigned) {
468     // PACKSS: Truncate signed value with signed saturation.
469     // Source values less than dst minint are saturated to minint.
470     // Source values greater than dst maxint are saturated to maxint.
471     MinValue =
472         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
473     MaxValue =
474         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
475   } else {
476     // PACKUS: Truncate signed value with unsigned saturation.
477     // Source values less than zero are saturated to zero.
478     // Source values greater than dst maxuint are saturated to maxuint.
479     MinValue = APInt::getZero(SrcScalarSizeInBits);
480     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
481   }
482 
483   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
484   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
485   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
486   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
487   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
488   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
489 
490   // Shuffle clamped args together at the lane level.
491   SmallVector<int, 32> PackMask;
492   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
493     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
494       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
495     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
496       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
497   }
498   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
499 
500   // Truncate to dst size.
501   return Builder.CreateTrunc(Shuffle, ResTy);
502 }
503 
504 static Value *simplifyX86movmsk(const IntrinsicInst &II,
505                                 InstCombiner::BuilderTy &Builder) {
506   Value *Arg = II.getArgOperand(0);
507   Type *ResTy = II.getType();
508 
509   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
510   if (isa<UndefValue>(Arg))
511     return Constant::getNullValue(ResTy);
512 
513   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
514   // We can't easily peek through x86_mmx types.
515   if (!ArgTy)
516     return nullptr;
517 
518   // Expand MOVMSK to compare/bitcast/zext:
519   // e.g. PMOVMSKB(v16i8 x):
520   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
521   // %int = bitcast <16 x i1> %cmp to i16
522   // %res = zext i16 %int to i32
523   unsigned NumElts = ArgTy->getNumElements();
524   Type *IntegerTy = Builder.getIntNTy(NumElts);
525 
526   Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
527   Res = Builder.CreateIsNeg(Res);
528   Res = Builder.CreateBitCast(Res, IntegerTy);
529   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
530   return Res;
531 }
532 
533 static Value *simplifyX86addcarry(const IntrinsicInst &II,
534                                   InstCombiner::BuilderTy &Builder) {
535   Value *CarryIn = II.getArgOperand(0);
536   Value *Op1 = II.getArgOperand(1);
537   Value *Op2 = II.getArgOperand(2);
538   Type *RetTy = II.getType();
539   Type *OpTy = Op1->getType();
540   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
541          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
542          "Unexpected types for x86 addcarry");
543 
544   // If carry-in is zero, this is just an unsigned add with overflow.
545   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
546     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
547                                           {Op1, Op2});
548     // The types have to be adjusted to match the x86 call types.
549     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
550     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
551                                        Builder.getInt8Ty());
552     Value *Res = PoisonValue::get(RetTy);
553     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
554     return Builder.CreateInsertValue(Res, UAddResult, 1);
555   }
556 
557   return nullptr;
558 }
559 
560 static Value *simplifyX86insertps(const IntrinsicInst &II,
561                                   InstCombiner::BuilderTy &Builder) {
562   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
563   if (!CInt)
564     return nullptr;
565 
566   auto *VecTy = cast<FixedVectorType>(II.getType());
567   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
568 
569   // The immediate permute control byte looks like this:
570   //    [3:0] - zero mask for each 32-bit lane
571   //    [5:4] - select one 32-bit destination lane
572   //    [7:6] - select one 32-bit source lane
573 
574   uint8_t Imm = CInt->getZExtValue();
575   uint8_t ZMask = Imm & 0xf;
576   uint8_t DestLane = (Imm >> 4) & 0x3;
577   uint8_t SourceLane = (Imm >> 6) & 0x3;
578 
579   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
580 
581   // If all zero mask bits are set, this was just a weird way to
582   // generate a zero vector.
583   if (ZMask == 0xf)
584     return ZeroVector;
585 
586   // Initialize by passing all of the first source bits through.
587   int ShuffleMask[4] = {0, 1, 2, 3};
588 
589   // We may replace the second operand with the zero vector.
590   Value *V1 = II.getArgOperand(1);
591 
592   if (ZMask) {
593     // If the zero mask is being used with a single input or the zero mask
594     // overrides the destination lane, this is a shuffle with the zero vector.
595     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
596         (ZMask & (1 << DestLane))) {
597       V1 = ZeroVector;
598       // We may still move 32-bits of the first source vector from one lane
599       // to another.
600       ShuffleMask[DestLane] = SourceLane;
601       // The zero mask may override the previous insert operation.
602       for (unsigned i = 0; i < 4; ++i)
603         if ((ZMask >> i) & 0x1)
604           ShuffleMask[i] = i + 4;
605     } else {
606       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
607       return nullptr;
608     }
609   } else {
610     // Replace the selected destination lane with the selected source lane.
611     ShuffleMask[DestLane] = SourceLane + 4;
612   }
613 
614   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
615 }
616 
617 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
618 /// or conversion to a shuffle vector.
619 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
620                                ConstantInt *CILength, ConstantInt *CIIndex,
621                                InstCombiner::BuilderTy &Builder) {
622   auto LowConstantHighUndef = [&](uint64_t Val) {
623     Type *IntTy64 = Type::getInt64Ty(II.getContext());
624     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
625                         UndefValue::get(IntTy64)};
626     return ConstantVector::get(Args);
627   };
628 
629   // See if we're dealing with constant values.
630   auto *C0 = dyn_cast<Constant>(Op0);
631   auto *CI0 =
632       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
633          : nullptr;
634 
635   // Attempt to constant fold.
636   if (CILength && CIIndex) {
637     // From AMD documentation: "The bit index and field length are each six
638     // bits in length other bits of the field are ignored."
639     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
640     APInt APLength = CILength->getValue().zextOrTrunc(6);
641 
642     unsigned Index = APIndex.getZExtValue();
643 
644     // From AMD documentation: "a value of zero in the field length is
645     // defined as length of 64".
646     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
647 
648     // From AMD documentation: "If the sum of the bit index + length field
649     // is greater than 64, the results are undefined".
650     unsigned End = Index + Length;
651 
652     // Note that both field index and field length are 8-bit quantities.
653     // Since variables 'Index' and 'Length' are unsigned values
654     // obtained from zero-extending field index and field length
655     // respectively, their sum should never wrap around.
656     if (End > 64)
657       return UndefValue::get(II.getType());
658 
659     // If we are inserting whole bytes, we can convert this to a shuffle.
660     // Lowering can recognize EXTRQI shuffle masks.
661     if ((Length % 8) == 0 && (Index % 8) == 0) {
662       // Convert bit indices to byte indices.
663       Length /= 8;
664       Index /= 8;
665 
666       Type *IntTy8 = Type::getInt8Ty(II.getContext());
667       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
668 
669       SmallVector<int, 16> ShuffleMask;
670       for (int i = 0; i != (int)Length; ++i)
671         ShuffleMask.push_back(i + Index);
672       for (int i = Length; i != 8; ++i)
673         ShuffleMask.push_back(i + 16);
674       for (int i = 8; i != 16; ++i)
675         ShuffleMask.push_back(-1);
676 
677       Value *SV = Builder.CreateShuffleVector(
678           Builder.CreateBitCast(Op0, ShufTy),
679           ConstantAggregateZero::get(ShufTy), ShuffleMask);
680       return Builder.CreateBitCast(SV, II.getType());
681     }
682 
683     // Constant Fold - shift Index'th bit to lowest position and mask off
684     // Length bits.
685     if (CI0) {
686       APInt Elt = CI0->getValue();
687       Elt.lshrInPlace(Index);
688       Elt = Elt.zextOrTrunc(Length);
689       return LowConstantHighUndef(Elt.getZExtValue());
690     }
691 
692     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
693     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
694       Value *Args[] = {Op0, CILength, CIIndex};
695       Module *M = II.getModule();
696       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
697       return Builder.CreateCall(F, Args);
698     }
699   }
700 
701   // Constant Fold - extraction from zero is always {zero, undef}.
702   if (CI0 && CI0->isZero())
703     return LowConstantHighUndef(0);
704 
705   return nullptr;
706 }
707 
708 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
709 /// folding or conversion to a shuffle vector.
710 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
711                                  APInt APLength, APInt APIndex,
712                                  InstCombiner::BuilderTy &Builder) {
713   // From AMD documentation: "The bit index and field length are each six bits
714   // in length other bits of the field are ignored."
715   APIndex = APIndex.zextOrTrunc(6);
716   APLength = APLength.zextOrTrunc(6);
717 
718   // Attempt to constant fold.
719   unsigned Index = APIndex.getZExtValue();
720 
721   // From AMD documentation: "a value of zero in the field length is
722   // defined as length of 64".
723   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
724 
725   // From AMD documentation: "If the sum of the bit index + length field
726   // is greater than 64, the results are undefined".
727   unsigned End = Index + Length;
728 
729   // Note that both field index and field length are 8-bit quantities.
730   // Since variables 'Index' and 'Length' are unsigned values
731   // obtained from zero-extending field index and field length
732   // respectively, their sum should never wrap around.
733   if (End > 64)
734     return UndefValue::get(II.getType());
735 
736   // If we are inserting whole bytes, we can convert this to a shuffle.
737   // Lowering can recognize INSERTQI shuffle masks.
738   if ((Length % 8) == 0 && (Index % 8) == 0) {
739     // Convert bit indices to byte indices.
740     Length /= 8;
741     Index /= 8;
742 
743     Type *IntTy8 = Type::getInt8Ty(II.getContext());
744     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
745 
746     SmallVector<int, 16> ShuffleMask;
747     for (int i = 0; i != (int)Index; ++i)
748       ShuffleMask.push_back(i);
749     for (int i = 0; i != (int)Length; ++i)
750       ShuffleMask.push_back(i + 16);
751     for (int i = Index + Length; i != 8; ++i)
752       ShuffleMask.push_back(i);
753     for (int i = 8; i != 16; ++i)
754       ShuffleMask.push_back(-1);
755 
756     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
757                                             Builder.CreateBitCast(Op1, ShufTy),
758                                             ShuffleMask);
759     return Builder.CreateBitCast(SV, II.getType());
760   }
761 
762   // See if we're dealing with constant values.
763   auto *C0 = dyn_cast<Constant>(Op0);
764   auto *C1 = dyn_cast<Constant>(Op1);
765   auto *CI00 =
766       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
767          : nullptr;
768   auto *CI10 =
769       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
770          : nullptr;
771 
772   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
773   if (CI00 && CI10) {
774     APInt V00 = CI00->getValue();
775     APInt V10 = CI10->getValue();
776     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
777     V00 = V00 & ~Mask;
778     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
779     APInt Val = V00 | V10;
780     Type *IntTy64 = Type::getInt64Ty(II.getContext());
781     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
782                         UndefValue::get(IntTy64)};
783     return ConstantVector::get(Args);
784   }
785 
786   // If we were an INSERTQ call, we'll save demanded elements if we convert to
787   // INSERTQI.
788   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
789     Type *IntTy8 = Type::getInt8Ty(II.getContext());
790     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
791     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
792 
793     Value *Args[] = {Op0, Op1, CILength, CIIndex};
794     Module *M = II.getModule();
795     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
796     return Builder.CreateCall(F, Args);
797   }
798 
799   return nullptr;
800 }
801 
802 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
803 static Value *simplifyX86pshufb(const IntrinsicInst &II,
804                                 InstCombiner::BuilderTy &Builder) {
805   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
806   if (!V)
807     return nullptr;
808 
809   auto *VecTy = cast<FixedVectorType>(II.getType());
810   unsigned NumElts = VecTy->getNumElements();
811   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
812          "Unexpected number of elements in shuffle mask!");
813 
814   // Construct a shuffle mask from constant integers or UNDEFs.
815   int Indexes[64];
816 
817   // Each byte in the shuffle control mask forms an index to permute the
818   // corresponding byte in the destination operand.
819   for (unsigned I = 0; I < NumElts; ++I) {
820     Constant *COp = V->getAggregateElement(I);
821     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
822       return nullptr;
823 
824     if (isa<UndefValue>(COp)) {
825       Indexes[I] = -1;
826       continue;
827     }
828 
829     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
830 
831     // If the most significant bit (bit[7]) of each byte of the shuffle
832     // control mask is set, then zero is written in the result byte.
833     // The zero vector is in the right-hand side of the resulting
834     // shufflevector.
835 
836     // The value of each index for the high 128-bit lane is the least
837     // significant 4 bits of the respective shuffle control byte.
838     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
839     Indexes[I] = Index;
840   }
841 
842   auto V1 = II.getArgOperand(0);
843   auto V2 = Constant::getNullValue(VecTy);
844   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
845 }
846 
847 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
848 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
849                                     InstCombiner::BuilderTy &Builder) {
850   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
851   if (!V)
852     return nullptr;
853 
854   auto *VecTy = cast<FixedVectorType>(II.getType());
855   unsigned NumElts = VecTy->getNumElements();
856   bool IsPD = VecTy->getScalarType()->isDoubleTy();
857   unsigned NumLaneElts = IsPD ? 2 : 4;
858   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
859 
860   // Construct a shuffle mask from constant integers or UNDEFs.
861   int Indexes[16];
862 
863   // The intrinsics only read one or two bits, clear the rest.
864   for (unsigned I = 0; I < NumElts; ++I) {
865     Constant *COp = V->getAggregateElement(I);
866     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
867       return nullptr;
868 
869     if (isa<UndefValue>(COp)) {
870       Indexes[I] = -1;
871       continue;
872     }
873 
874     APInt Index = cast<ConstantInt>(COp)->getValue();
875     Index = Index.zextOrTrunc(32).getLoBits(2);
876 
877     // The PD variants uses bit 1 to select per-lane element index, so
878     // shift down to convert to generic shuffle mask index.
879     if (IsPD)
880       Index.lshrInPlace(1);
881 
882     // The _256 variants are a bit trickier since the mask bits always index
883     // into the corresponding 128 half. In order to convert to a generic
884     // shuffle, we have to make that explicit.
885     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
886 
887     Indexes[I] = Index.getZExtValue();
888   }
889 
890   auto V1 = II.getArgOperand(0);
891   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
892 }
893 
894 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
895 static Value *simplifyX86vpermv(const IntrinsicInst &II,
896                                 InstCombiner::BuilderTy &Builder) {
897   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
898   if (!V)
899     return nullptr;
900 
901   auto *VecTy = cast<FixedVectorType>(II.getType());
902   unsigned Size = VecTy->getNumElements();
903   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
904          "Unexpected shuffle mask size");
905 
906   // Construct a shuffle mask from constant integers or UNDEFs.
907   int Indexes[64];
908 
909   for (unsigned I = 0; I < Size; ++I) {
910     Constant *COp = V->getAggregateElement(I);
911     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
912       return nullptr;
913 
914     if (isa<UndefValue>(COp)) {
915       Indexes[I] = -1;
916       continue;
917     }
918 
919     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
920     Index &= Size - 1;
921     Indexes[I] = Index;
922   }
923 
924   auto V1 = II.getArgOperand(0);
925   return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
926 }
927 
928 std::optional<Instruction *>
929 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
930   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
931                                              unsigned DemandedWidth) {
932     APInt UndefElts(Width, 0);
933     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
934     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
935   };
936 
937   Intrinsic::ID IID = II.getIntrinsicID();
938   switch (IID) {
939   case Intrinsic::x86_bmi_bextr_32:
940   case Intrinsic::x86_bmi_bextr_64:
941   case Intrinsic::x86_tbm_bextri_u32:
942   case Intrinsic::x86_tbm_bextri_u64:
943     // If the RHS is a constant we can try some simplifications.
944     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
945       uint64_t Shift = C->getZExtValue();
946       uint64_t Length = (Shift >> 8) & 0xff;
947       Shift &= 0xff;
948       unsigned BitWidth = II.getType()->getIntegerBitWidth();
949       // If the length is 0 or the shift is out of range, replace with zero.
950       if (Length == 0 || Shift >= BitWidth) {
951         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
952       }
953       // If the LHS is also a constant, we can completely constant fold this.
954       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
955         uint64_t Result = InC->getZExtValue() >> Shift;
956         if (Length > BitWidth)
957           Length = BitWidth;
958         Result &= maskTrailingOnes<uint64_t>(Length);
959         return IC.replaceInstUsesWith(II,
960                                       ConstantInt::get(II.getType(), Result));
961       }
962       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
963       // are only masking bits that a shift already cleared?
964     }
965     break;
966 
967   case Intrinsic::x86_bmi_bzhi_32:
968   case Intrinsic::x86_bmi_bzhi_64:
969     // If the RHS is a constant we can try some simplifications.
970     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
971       uint64_t Index = C->getZExtValue() & 0xff;
972       unsigned BitWidth = II.getType()->getIntegerBitWidth();
973       if (Index >= BitWidth) {
974         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
975       }
976       if (Index == 0) {
977         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
978       }
979       // If the LHS is also a constant, we can completely constant fold this.
980       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
981         uint64_t Result = InC->getZExtValue();
982         Result &= maskTrailingOnes<uint64_t>(Index);
983         return IC.replaceInstUsesWith(II,
984                                       ConstantInt::get(II.getType(), Result));
985       }
986       // TODO should we convert this to an AND if the RHS is constant?
987     }
988     break;
989   case Intrinsic::x86_bmi_pext_32:
990   case Intrinsic::x86_bmi_pext_64:
991     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
992       if (MaskC->isNullValue()) {
993         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
994       }
995       if (MaskC->isAllOnesValue()) {
996         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
997       }
998 
999       unsigned MaskIdx, MaskLen;
1000       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1001         // any single contingous sequence of 1s anywhere in the mask simply
1002         // describes a subset of the input bits shifted to the appropriate
1003         // position.  Replace with the straight forward IR.
1004         Value *Input = II.getArgOperand(0);
1005         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1006         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1007         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
1008         return IC.replaceInstUsesWith(II, Shifted);
1009       }
1010 
1011       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1012         uint64_t Src = SrcC->getZExtValue();
1013         uint64_t Mask = MaskC->getZExtValue();
1014         uint64_t Result = 0;
1015         uint64_t BitToSet = 1;
1016 
1017         while (Mask) {
1018           // Isolate lowest set bit.
1019           uint64_t BitToTest = Mask & -Mask;
1020           if (BitToTest & Src)
1021             Result |= BitToSet;
1022 
1023           BitToSet <<= 1;
1024           // Clear lowest set bit.
1025           Mask &= Mask - 1;
1026         }
1027 
1028         return IC.replaceInstUsesWith(II,
1029                                       ConstantInt::get(II.getType(), Result));
1030       }
1031     }
1032     break;
1033   case Intrinsic::x86_bmi_pdep_32:
1034   case Intrinsic::x86_bmi_pdep_64:
1035     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1036       if (MaskC->isNullValue()) {
1037         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1038       }
1039       if (MaskC->isAllOnesValue()) {
1040         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1041       }
1042 
1043       unsigned MaskIdx, MaskLen;
1044       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1045         // any single contingous sequence of 1s anywhere in the mask simply
1046         // describes a subset of the input bits shifted to the appropriate
1047         // position.  Replace with the straight forward IR.
1048         Value *Input = II.getArgOperand(0);
1049         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1050         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
1051         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1052         return IC.replaceInstUsesWith(II, Masked);
1053       }
1054 
1055       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1056         uint64_t Src = SrcC->getZExtValue();
1057         uint64_t Mask = MaskC->getZExtValue();
1058         uint64_t Result = 0;
1059         uint64_t BitToTest = 1;
1060 
1061         while (Mask) {
1062           // Isolate lowest set bit.
1063           uint64_t BitToSet = Mask & -Mask;
1064           if (BitToTest & Src)
1065             Result |= BitToSet;
1066 
1067           BitToTest <<= 1;
1068           // Clear lowest set bit;
1069           Mask &= Mask - 1;
1070         }
1071 
1072         return IC.replaceInstUsesWith(II,
1073                                       ConstantInt::get(II.getType(), Result));
1074       }
1075     }
1076     break;
1077 
1078   case Intrinsic::x86_sse_cvtss2si:
1079   case Intrinsic::x86_sse_cvtss2si64:
1080   case Intrinsic::x86_sse_cvttss2si:
1081   case Intrinsic::x86_sse_cvttss2si64:
1082   case Intrinsic::x86_sse2_cvtsd2si:
1083   case Intrinsic::x86_sse2_cvtsd2si64:
1084   case Intrinsic::x86_sse2_cvttsd2si:
1085   case Intrinsic::x86_sse2_cvttsd2si64:
1086   case Intrinsic::x86_avx512_vcvtss2si32:
1087   case Intrinsic::x86_avx512_vcvtss2si64:
1088   case Intrinsic::x86_avx512_vcvtss2usi32:
1089   case Intrinsic::x86_avx512_vcvtss2usi64:
1090   case Intrinsic::x86_avx512_vcvtsd2si32:
1091   case Intrinsic::x86_avx512_vcvtsd2si64:
1092   case Intrinsic::x86_avx512_vcvtsd2usi32:
1093   case Intrinsic::x86_avx512_vcvtsd2usi64:
1094   case Intrinsic::x86_avx512_cvttss2si:
1095   case Intrinsic::x86_avx512_cvttss2si64:
1096   case Intrinsic::x86_avx512_cvttss2usi:
1097   case Intrinsic::x86_avx512_cvttss2usi64:
1098   case Intrinsic::x86_avx512_cvttsd2si:
1099   case Intrinsic::x86_avx512_cvttsd2si64:
1100   case Intrinsic::x86_avx512_cvttsd2usi:
1101   case Intrinsic::x86_avx512_cvttsd2usi64: {
1102     // These intrinsics only demand the 0th element of their input vectors. If
1103     // we can simplify the input based on that, do so now.
1104     Value *Arg = II.getArgOperand(0);
1105     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
1106     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
1107       return IC.replaceOperand(II, 0, V);
1108     }
1109     break;
1110   }
1111 
1112   case Intrinsic::x86_mmx_pmovmskb:
1113   case Intrinsic::x86_sse_movmsk_ps:
1114   case Intrinsic::x86_sse2_movmsk_pd:
1115   case Intrinsic::x86_sse2_pmovmskb_128:
1116   case Intrinsic::x86_avx_movmsk_pd_256:
1117   case Intrinsic::x86_avx_movmsk_ps_256:
1118   case Intrinsic::x86_avx2_pmovmskb:
1119     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
1120       return IC.replaceInstUsesWith(II, V);
1121     }
1122     break;
1123 
1124   case Intrinsic::x86_sse_comieq_ss:
1125   case Intrinsic::x86_sse_comige_ss:
1126   case Intrinsic::x86_sse_comigt_ss:
1127   case Intrinsic::x86_sse_comile_ss:
1128   case Intrinsic::x86_sse_comilt_ss:
1129   case Intrinsic::x86_sse_comineq_ss:
1130   case Intrinsic::x86_sse_ucomieq_ss:
1131   case Intrinsic::x86_sse_ucomige_ss:
1132   case Intrinsic::x86_sse_ucomigt_ss:
1133   case Intrinsic::x86_sse_ucomile_ss:
1134   case Intrinsic::x86_sse_ucomilt_ss:
1135   case Intrinsic::x86_sse_ucomineq_ss:
1136   case Intrinsic::x86_sse2_comieq_sd:
1137   case Intrinsic::x86_sse2_comige_sd:
1138   case Intrinsic::x86_sse2_comigt_sd:
1139   case Intrinsic::x86_sse2_comile_sd:
1140   case Intrinsic::x86_sse2_comilt_sd:
1141   case Intrinsic::x86_sse2_comineq_sd:
1142   case Intrinsic::x86_sse2_ucomieq_sd:
1143   case Intrinsic::x86_sse2_ucomige_sd:
1144   case Intrinsic::x86_sse2_ucomigt_sd:
1145   case Intrinsic::x86_sse2_ucomile_sd:
1146   case Intrinsic::x86_sse2_ucomilt_sd:
1147   case Intrinsic::x86_sse2_ucomineq_sd:
1148   case Intrinsic::x86_avx512_vcomi_ss:
1149   case Intrinsic::x86_avx512_vcomi_sd:
1150   case Intrinsic::x86_avx512_mask_cmp_ss:
1151   case Intrinsic::x86_avx512_mask_cmp_sd: {
1152     // These intrinsics only demand the 0th element of their input vectors. If
1153     // we can simplify the input based on that, do so now.
1154     bool MadeChange = false;
1155     Value *Arg0 = II.getArgOperand(0);
1156     Value *Arg1 = II.getArgOperand(1);
1157     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1158     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
1159       IC.replaceOperand(II, 0, V);
1160       MadeChange = true;
1161     }
1162     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
1163       IC.replaceOperand(II, 1, V);
1164       MadeChange = true;
1165     }
1166     if (MadeChange) {
1167       return &II;
1168     }
1169     break;
1170   }
1171 
1172   case Intrinsic::x86_avx512_add_ps_512:
1173   case Intrinsic::x86_avx512_div_ps_512:
1174   case Intrinsic::x86_avx512_mul_ps_512:
1175   case Intrinsic::x86_avx512_sub_ps_512:
1176   case Intrinsic::x86_avx512_add_pd_512:
1177   case Intrinsic::x86_avx512_div_pd_512:
1178   case Intrinsic::x86_avx512_mul_pd_512:
1179   case Intrinsic::x86_avx512_sub_pd_512:
1180     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1181     // IR operations.
1182     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1183       if (R->getValue() == 4) {
1184         Value *Arg0 = II.getArgOperand(0);
1185         Value *Arg1 = II.getArgOperand(1);
1186 
1187         Value *V;
1188         switch (IID) {
1189         default:
1190           llvm_unreachable("Case stmts out of sync!");
1191         case Intrinsic::x86_avx512_add_ps_512:
1192         case Intrinsic::x86_avx512_add_pd_512:
1193           V = IC.Builder.CreateFAdd(Arg0, Arg1);
1194           break;
1195         case Intrinsic::x86_avx512_sub_ps_512:
1196         case Intrinsic::x86_avx512_sub_pd_512:
1197           V = IC.Builder.CreateFSub(Arg0, Arg1);
1198           break;
1199         case Intrinsic::x86_avx512_mul_ps_512:
1200         case Intrinsic::x86_avx512_mul_pd_512:
1201           V = IC.Builder.CreateFMul(Arg0, Arg1);
1202           break;
1203         case Intrinsic::x86_avx512_div_ps_512:
1204         case Intrinsic::x86_avx512_div_pd_512:
1205           V = IC.Builder.CreateFDiv(Arg0, Arg1);
1206           break;
1207         }
1208 
1209         return IC.replaceInstUsesWith(II, V);
1210       }
1211     }
1212     break;
1213 
1214   case Intrinsic::x86_avx512_mask_add_ss_round:
1215   case Intrinsic::x86_avx512_mask_div_ss_round:
1216   case Intrinsic::x86_avx512_mask_mul_ss_round:
1217   case Intrinsic::x86_avx512_mask_sub_ss_round:
1218   case Intrinsic::x86_avx512_mask_add_sd_round:
1219   case Intrinsic::x86_avx512_mask_div_sd_round:
1220   case Intrinsic::x86_avx512_mask_mul_sd_round:
1221   case Intrinsic::x86_avx512_mask_sub_sd_round:
1222     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1223     // IR operations.
1224     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
1225       if (R->getValue() == 4) {
1226         // Extract the element as scalars.
1227         Value *Arg0 = II.getArgOperand(0);
1228         Value *Arg1 = II.getArgOperand(1);
1229         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1230         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1231 
1232         Value *V;
1233         switch (IID) {
1234         default:
1235           llvm_unreachable("Case stmts out of sync!");
1236         case Intrinsic::x86_avx512_mask_add_ss_round:
1237         case Intrinsic::x86_avx512_mask_add_sd_round:
1238           V = IC.Builder.CreateFAdd(LHS, RHS);
1239           break;
1240         case Intrinsic::x86_avx512_mask_sub_ss_round:
1241         case Intrinsic::x86_avx512_mask_sub_sd_round:
1242           V = IC.Builder.CreateFSub(LHS, RHS);
1243           break;
1244         case Intrinsic::x86_avx512_mask_mul_ss_round:
1245         case Intrinsic::x86_avx512_mask_mul_sd_round:
1246           V = IC.Builder.CreateFMul(LHS, RHS);
1247           break;
1248         case Intrinsic::x86_avx512_mask_div_ss_round:
1249         case Intrinsic::x86_avx512_mask_div_sd_round:
1250           V = IC.Builder.CreateFDiv(LHS, RHS);
1251           break;
1252         }
1253 
1254         // Handle the masking aspect of the intrinsic.
1255         Value *Mask = II.getArgOperand(3);
1256         auto *C = dyn_cast<ConstantInt>(Mask);
1257         // We don't need a select if we know the mask bit is a 1.
1258         if (!C || !C->getValue()[0]) {
1259           // Cast the mask to an i1 vector and then extract the lowest element.
1260           auto *MaskTy = FixedVectorType::get(
1261               IC.Builder.getInt1Ty(),
1262               cast<IntegerType>(Mask->getType())->getBitWidth());
1263           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
1264           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
1265           // Extract the lowest element from the passthru operand.
1266           Value *Passthru =
1267               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
1268           V = IC.Builder.CreateSelect(Mask, V, Passthru);
1269         }
1270 
1271         // Insert the result back into the original argument 0.
1272         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
1273 
1274         return IC.replaceInstUsesWith(II, V);
1275       }
1276     }
1277     break;
1278 
1279   // Constant fold ashr( <A x Bi>, Ci ).
1280   // Constant fold lshr( <A x Bi>, Ci ).
1281   // Constant fold shl( <A x Bi>, Ci ).
1282   case Intrinsic::x86_sse2_psrai_d:
1283   case Intrinsic::x86_sse2_psrai_w:
1284   case Intrinsic::x86_avx2_psrai_d:
1285   case Intrinsic::x86_avx2_psrai_w:
1286   case Intrinsic::x86_avx512_psrai_q_128:
1287   case Intrinsic::x86_avx512_psrai_q_256:
1288   case Intrinsic::x86_avx512_psrai_d_512:
1289   case Intrinsic::x86_avx512_psrai_q_512:
1290   case Intrinsic::x86_avx512_psrai_w_512:
1291   case Intrinsic::x86_sse2_psrli_d:
1292   case Intrinsic::x86_sse2_psrli_q:
1293   case Intrinsic::x86_sse2_psrli_w:
1294   case Intrinsic::x86_avx2_psrli_d:
1295   case Intrinsic::x86_avx2_psrli_q:
1296   case Intrinsic::x86_avx2_psrli_w:
1297   case Intrinsic::x86_avx512_psrli_d_512:
1298   case Intrinsic::x86_avx512_psrli_q_512:
1299   case Intrinsic::x86_avx512_psrli_w_512:
1300   case Intrinsic::x86_sse2_pslli_d:
1301   case Intrinsic::x86_sse2_pslli_q:
1302   case Intrinsic::x86_sse2_pslli_w:
1303   case Intrinsic::x86_avx2_pslli_d:
1304   case Intrinsic::x86_avx2_pslli_q:
1305   case Intrinsic::x86_avx2_pslli_w:
1306   case Intrinsic::x86_avx512_pslli_d_512:
1307   case Intrinsic::x86_avx512_pslli_q_512:
1308   case Intrinsic::x86_avx512_pslli_w_512:
1309     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1310       return IC.replaceInstUsesWith(II, V);
1311     }
1312     break;
1313 
1314   case Intrinsic::x86_sse2_psra_d:
1315   case Intrinsic::x86_sse2_psra_w:
1316   case Intrinsic::x86_avx2_psra_d:
1317   case Intrinsic::x86_avx2_psra_w:
1318   case Intrinsic::x86_avx512_psra_q_128:
1319   case Intrinsic::x86_avx512_psra_q_256:
1320   case Intrinsic::x86_avx512_psra_d_512:
1321   case Intrinsic::x86_avx512_psra_q_512:
1322   case Intrinsic::x86_avx512_psra_w_512:
1323   case Intrinsic::x86_sse2_psrl_d:
1324   case Intrinsic::x86_sse2_psrl_q:
1325   case Intrinsic::x86_sse2_psrl_w:
1326   case Intrinsic::x86_avx2_psrl_d:
1327   case Intrinsic::x86_avx2_psrl_q:
1328   case Intrinsic::x86_avx2_psrl_w:
1329   case Intrinsic::x86_avx512_psrl_d_512:
1330   case Intrinsic::x86_avx512_psrl_q_512:
1331   case Intrinsic::x86_avx512_psrl_w_512:
1332   case Intrinsic::x86_sse2_psll_d:
1333   case Intrinsic::x86_sse2_psll_q:
1334   case Intrinsic::x86_sse2_psll_w:
1335   case Intrinsic::x86_avx2_psll_d:
1336   case Intrinsic::x86_avx2_psll_q:
1337   case Intrinsic::x86_avx2_psll_w:
1338   case Intrinsic::x86_avx512_psll_d_512:
1339   case Intrinsic::x86_avx512_psll_q_512:
1340   case Intrinsic::x86_avx512_psll_w_512: {
1341     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1342       return IC.replaceInstUsesWith(II, V);
1343     }
1344 
1345     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
1346     // operand to compute the shift amount.
1347     Value *Arg1 = II.getArgOperand(1);
1348     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
1349            "Unexpected packed shift size");
1350     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
1351 
1352     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
1353       return IC.replaceOperand(II, 1, V);
1354     }
1355     break;
1356   }
1357 
1358   case Intrinsic::x86_avx2_psllv_d:
1359   case Intrinsic::x86_avx2_psllv_d_256:
1360   case Intrinsic::x86_avx2_psllv_q:
1361   case Intrinsic::x86_avx2_psllv_q_256:
1362   case Intrinsic::x86_avx512_psllv_d_512:
1363   case Intrinsic::x86_avx512_psllv_q_512:
1364   case Intrinsic::x86_avx512_psllv_w_128:
1365   case Intrinsic::x86_avx512_psllv_w_256:
1366   case Intrinsic::x86_avx512_psllv_w_512:
1367   case Intrinsic::x86_avx2_psrav_d:
1368   case Intrinsic::x86_avx2_psrav_d_256:
1369   case Intrinsic::x86_avx512_psrav_q_128:
1370   case Intrinsic::x86_avx512_psrav_q_256:
1371   case Intrinsic::x86_avx512_psrav_d_512:
1372   case Intrinsic::x86_avx512_psrav_q_512:
1373   case Intrinsic::x86_avx512_psrav_w_128:
1374   case Intrinsic::x86_avx512_psrav_w_256:
1375   case Intrinsic::x86_avx512_psrav_w_512:
1376   case Intrinsic::x86_avx2_psrlv_d:
1377   case Intrinsic::x86_avx2_psrlv_d_256:
1378   case Intrinsic::x86_avx2_psrlv_q:
1379   case Intrinsic::x86_avx2_psrlv_q_256:
1380   case Intrinsic::x86_avx512_psrlv_d_512:
1381   case Intrinsic::x86_avx512_psrlv_q_512:
1382   case Intrinsic::x86_avx512_psrlv_w_128:
1383   case Intrinsic::x86_avx512_psrlv_w_256:
1384   case Intrinsic::x86_avx512_psrlv_w_512:
1385     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
1386       return IC.replaceInstUsesWith(II, V);
1387     }
1388     break;
1389 
1390   case Intrinsic::x86_sse2_packssdw_128:
1391   case Intrinsic::x86_sse2_packsswb_128:
1392   case Intrinsic::x86_avx2_packssdw:
1393   case Intrinsic::x86_avx2_packsswb:
1394   case Intrinsic::x86_avx512_packssdw_512:
1395   case Intrinsic::x86_avx512_packsswb_512:
1396     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
1397       return IC.replaceInstUsesWith(II, V);
1398     }
1399     break;
1400 
1401   case Intrinsic::x86_sse2_packuswb_128:
1402   case Intrinsic::x86_sse41_packusdw:
1403   case Intrinsic::x86_avx2_packusdw:
1404   case Intrinsic::x86_avx2_packuswb:
1405   case Intrinsic::x86_avx512_packusdw_512:
1406   case Intrinsic::x86_avx512_packuswb_512:
1407     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
1408       return IC.replaceInstUsesWith(II, V);
1409     }
1410     break;
1411 
1412   case Intrinsic::x86_pclmulqdq:
1413   case Intrinsic::x86_pclmulqdq_256:
1414   case Intrinsic::x86_pclmulqdq_512: {
1415     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1416       unsigned Imm = C->getZExtValue();
1417 
1418       bool MadeChange = false;
1419       Value *Arg0 = II.getArgOperand(0);
1420       Value *Arg1 = II.getArgOperand(1);
1421       unsigned VWidth =
1422           cast<FixedVectorType>(Arg0->getType())->getNumElements();
1423 
1424       APInt UndefElts1(VWidth, 0);
1425       APInt DemandedElts1 =
1426           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
1427       if (Value *V =
1428               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
1429         IC.replaceOperand(II, 0, V);
1430         MadeChange = true;
1431       }
1432 
1433       APInt UndefElts2(VWidth, 0);
1434       APInt DemandedElts2 =
1435           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
1436       if (Value *V =
1437               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
1438         IC.replaceOperand(II, 1, V);
1439         MadeChange = true;
1440       }
1441 
1442       // If either input elements are undef, the result is zero.
1443       if (DemandedElts1.isSubsetOf(UndefElts1) ||
1444           DemandedElts2.isSubsetOf(UndefElts2)) {
1445         return IC.replaceInstUsesWith(II,
1446                                       ConstantAggregateZero::get(II.getType()));
1447       }
1448 
1449       if (MadeChange) {
1450         return &II;
1451       }
1452     }
1453     break;
1454   }
1455 
1456   case Intrinsic::x86_sse41_insertps:
1457     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1458       return IC.replaceInstUsesWith(II, V);
1459     }
1460     break;
1461 
1462   case Intrinsic::x86_sse4a_extrq: {
1463     Value *Op0 = II.getArgOperand(0);
1464     Value *Op1 = II.getArgOperand(1);
1465     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1466     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1467     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1468            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1469            VWidth1 == 16 && "Unexpected operand sizes");
1470 
1471     // See if we're dealing with constant values.
1472     auto *C1 = dyn_cast<Constant>(Op1);
1473     auto *CILength =
1474         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1475            : nullptr;
1476     auto *CIIndex =
1477         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1478            : nullptr;
1479 
1480     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1481     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1482       return IC.replaceInstUsesWith(II, V);
1483     }
1484 
1485     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
1486     // operands and the lowest 16-bits of the second.
1487     bool MadeChange = false;
1488     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1489       IC.replaceOperand(II, 0, V);
1490       MadeChange = true;
1491     }
1492     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1493       IC.replaceOperand(II, 1, V);
1494       MadeChange = true;
1495     }
1496     if (MadeChange) {
1497       return &II;
1498     }
1499     break;
1500   }
1501 
1502   case Intrinsic::x86_sse4a_extrqi: {
1503     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
1504     // bits of the lower 64-bits. The upper 64-bits are undefined.
1505     Value *Op0 = II.getArgOperand(0);
1506     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1507     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1508            "Unexpected operand size");
1509 
1510     // See if we're dealing with constant values.
1511     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1512     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1513 
1514     // Attempt to simplify to a constant or shuffle vector.
1515     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1516       return IC.replaceInstUsesWith(II, V);
1517     }
1518 
1519     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
1520     // operand.
1521     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1522       return IC.replaceOperand(II, 0, V);
1523     }
1524     break;
1525   }
1526 
1527   case Intrinsic::x86_sse4a_insertq: {
1528     Value *Op0 = II.getArgOperand(0);
1529     Value *Op1 = II.getArgOperand(1);
1530     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1531     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1532            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1533            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1534            "Unexpected operand size");
1535 
1536     // See if we're dealing with constant values.
1537     auto *C1 = dyn_cast<Constant>(Op1);
1538     auto *CI11 =
1539         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1540            : nullptr;
1541 
1542     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1543     if (CI11) {
1544       const APInt &V11 = CI11->getValue();
1545       APInt Len = V11.zextOrTrunc(6);
1546       APInt Idx = V11.lshr(8).zextOrTrunc(6);
1547       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1548         return IC.replaceInstUsesWith(II, V);
1549       }
1550     }
1551 
1552     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
1553     // operand.
1554     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1555       return IC.replaceOperand(II, 0, V);
1556     }
1557     break;
1558   }
1559 
1560   case Intrinsic::x86_sse4a_insertqi: {
1561     // INSERTQI: Extract lowest Length bits from lower half of second source and
1562     // insert over first source starting at Index bit. The upper 64-bits are
1563     // undefined.
1564     Value *Op0 = II.getArgOperand(0);
1565     Value *Op1 = II.getArgOperand(1);
1566     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1567     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1568     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1569            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1570            VWidth1 == 2 && "Unexpected operand sizes");
1571 
1572     // See if we're dealing with constant values.
1573     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1574     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1575 
1576     // Attempt to simplify to a constant or shuffle vector.
1577     if (CILength && CIIndex) {
1578       APInt Len = CILength->getValue().zextOrTrunc(6);
1579       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1580       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1581         return IC.replaceInstUsesWith(II, V);
1582       }
1583     }
1584 
1585     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
1586     // operands.
1587     bool MadeChange = false;
1588     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1589       IC.replaceOperand(II, 0, V);
1590       MadeChange = true;
1591     }
1592     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1593       IC.replaceOperand(II, 1, V);
1594       MadeChange = true;
1595     }
1596     if (MadeChange) {
1597       return &II;
1598     }
1599     break;
1600   }
1601 
1602   case Intrinsic::x86_sse41_pblendvb:
1603   case Intrinsic::x86_sse41_blendvps:
1604   case Intrinsic::x86_sse41_blendvpd:
1605   case Intrinsic::x86_avx_blendv_ps_256:
1606   case Intrinsic::x86_avx_blendv_pd_256:
1607   case Intrinsic::x86_avx2_pblendvb: {
1608     // fold (blend A, A, Mask) -> A
1609     Value *Op0 = II.getArgOperand(0);
1610     Value *Op1 = II.getArgOperand(1);
1611     Value *Mask = II.getArgOperand(2);
1612     if (Op0 == Op1) {
1613       return IC.replaceInstUsesWith(II, Op0);
1614     }
1615 
1616     // Zero Mask - select 1st argument.
1617     if (isa<ConstantAggregateZero>(Mask)) {
1618       return IC.replaceInstUsesWith(II, Op0);
1619     }
1620 
1621     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
1622     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1623       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1624       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1625     }
1626 
1627     // Convert to a vector select if we can bypass casts and find a boolean
1628     // vector condition value.
1629     Value *BoolVec;
1630     Mask = InstCombiner::peekThroughBitcast(Mask);
1631     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
1632         BoolVec->getType()->isVectorTy() &&
1633         BoolVec->getType()->getScalarSizeInBits() == 1) {
1634       assert(Mask->getType()->getPrimitiveSizeInBits() ==
1635                  II.getType()->getPrimitiveSizeInBits() &&
1636              "Not expecting mask and operands with different sizes");
1637 
1638       unsigned NumMaskElts =
1639           cast<FixedVectorType>(Mask->getType())->getNumElements();
1640       unsigned NumOperandElts =
1641           cast<FixedVectorType>(II.getType())->getNumElements();
1642       if (NumMaskElts == NumOperandElts) {
1643         return SelectInst::Create(BoolVec, Op1, Op0);
1644       }
1645 
1646       // If the mask has less elements than the operands, each mask bit maps to
1647       // multiple elements of the operands. Bitcast back and forth.
1648       if (NumMaskElts < NumOperandElts) {
1649         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1650         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1651         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1652         return new BitCastInst(Sel, II.getType());
1653       }
1654     }
1655 
1656     break;
1657   }
1658 
1659   case Intrinsic::x86_ssse3_pshuf_b_128:
1660   case Intrinsic::x86_avx2_pshuf_b:
1661   case Intrinsic::x86_avx512_pshuf_b_512:
1662     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1663       return IC.replaceInstUsesWith(II, V);
1664     }
1665     break;
1666 
1667   case Intrinsic::x86_avx_vpermilvar_ps:
1668   case Intrinsic::x86_avx_vpermilvar_ps_256:
1669   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1670   case Intrinsic::x86_avx_vpermilvar_pd:
1671   case Intrinsic::x86_avx_vpermilvar_pd_256:
1672   case Intrinsic::x86_avx512_vpermilvar_pd_512:
1673     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1674       return IC.replaceInstUsesWith(II, V);
1675     }
1676     break;
1677 
1678   case Intrinsic::x86_avx2_permd:
1679   case Intrinsic::x86_avx2_permps:
1680   case Intrinsic::x86_avx512_permvar_df_256:
1681   case Intrinsic::x86_avx512_permvar_df_512:
1682   case Intrinsic::x86_avx512_permvar_di_256:
1683   case Intrinsic::x86_avx512_permvar_di_512:
1684   case Intrinsic::x86_avx512_permvar_hi_128:
1685   case Intrinsic::x86_avx512_permvar_hi_256:
1686   case Intrinsic::x86_avx512_permvar_hi_512:
1687   case Intrinsic::x86_avx512_permvar_qi_128:
1688   case Intrinsic::x86_avx512_permvar_qi_256:
1689   case Intrinsic::x86_avx512_permvar_qi_512:
1690   case Intrinsic::x86_avx512_permvar_sf_512:
1691   case Intrinsic::x86_avx512_permvar_si_512:
1692     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1693       return IC.replaceInstUsesWith(II, V);
1694     }
1695     break;
1696 
1697   case Intrinsic::x86_avx_maskload_ps:
1698   case Intrinsic::x86_avx_maskload_pd:
1699   case Intrinsic::x86_avx_maskload_ps_256:
1700   case Intrinsic::x86_avx_maskload_pd_256:
1701   case Intrinsic::x86_avx2_maskload_d:
1702   case Intrinsic::x86_avx2_maskload_q:
1703   case Intrinsic::x86_avx2_maskload_d_256:
1704   case Intrinsic::x86_avx2_maskload_q_256:
1705     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1706       return I;
1707     }
1708     break;
1709 
1710   case Intrinsic::x86_sse2_maskmov_dqu:
1711   case Intrinsic::x86_avx_maskstore_ps:
1712   case Intrinsic::x86_avx_maskstore_pd:
1713   case Intrinsic::x86_avx_maskstore_ps_256:
1714   case Intrinsic::x86_avx_maskstore_pd_256:
1715   case Intrinsic::x86_avx2_maskstore_d:
1716   case Intrinsic::x86_avx2_maskstore_q:
1717   case Intrinsic::x86_avx2_maskstore_d_256:
1718   case Intrinsic::x86_avx2_maskstore_q_256:
1719     if (simplifyX86MaskedStore(II, IC)) {
1720       return nullptr;
1721     }
1722     break;
1723 
1724   case Intrinsic::x86_addcarry_32:
1725   case Intrinsic::x86_addcarry_64:
1726     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1727       return IC.replaceInstUsesWith(II, V);
1728     }
1729     break;
1730 
1731   default:
1732     break;
1733   }
1734   return std::nullopt;
1735 }
1736 
1737 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1738     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1739     bool &KnownBitsComputed) const {
1740   switch (II.getIntrinsicID()) {
1741   default:
1742     break;
1743   case Intrinsic::x86_mmx_pmovmskb:
1744   case Intrinsic::x86_sse_movmsk_ps:
1745   case Intrinsic::x86_sse2_movmsk_pd:
1746   case Intrinsic::x86_sse2_pmovmskb_128:
1747   case Intrinsic::x86_avx_movmsk_ps_256:
1748   case Intrinsic::x86_avx_movmsk_pd_256:
1749   case Intrinsic::x86_avx2_pmovmskb: {
1750     // MOVMSK copies the vector elements' sign bits to the low bits
1751     // and zeros the high bits.
1752     unsigned ArgWidth;
1753     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1754       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1755     } else {
1756       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
1757       ArgWidth = ArgType->getNumElements();
1758     }
1759 
1760     // If we don't need any of low bits then return zero,
1761     // we know that DemandedMask is non-zero already.
1762     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1763     Type *VTy = II.getType();
1764     if (DemandedElts.isZero()) {
1765       return ConstantInt::getNullValue(VTy);
1766     }
1767 
1768     // We know that the upper bits are set to zero.
1769     Known.Zero.setBitsFrom(ArgWidth);
1770     KnownBitsComputed = true;
1771     break;
1772   }
1773   }
1774   return std::nullopt;
1775 }
1776 
1777 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1778     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1779     APInt &UndefElts2, APInt &UndefElts3,
1780     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1781         simplifyAndSetOp) const {
1782   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1783   switch (II.getIntrinsicID()) {
1784   default:
1785     break;
1786   case Intrinsic::x86_xop_vfrcz_ss:
1787   case Intrinsic::x86_xop_vfrcz_sd:
1788     // The instructions for these intrinsics are speced to zero upper bits not
1789     // pass them through like other scalar intrinsics. So we shouldn't just
1790     // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
1791     // Instead we should return a zero vector.
1792     if (!DemandedElts[0]) {
1793       IC.addToWorklist(&II);
1794       return ConstantAggregateZero::get(II.getType());
1795     }
1796 
1797     // Only the lower element is used.
1798     DemandedElts = 1;
1799     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1800 
1801     // Only the lower element is undefined. The high elements are zero.
1802     UndefElts = UndefElts[0];
1803     break;
1804 
1805   // Unary scalar-as-vector operations that work column-wise.
1806   case Intrinsic::x86_sse_rcp_ss:
1807   case Intrinsic::x86_sse_rsqrt_ss:
1808     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1809 
1810     // If lowest element of a scalar op isn't used then use Arg0.
1811     if (!DemandedElts[0]) {
1812       IC.addToWorklist(&II);
1813       return II.getArgOperand(0);
1814     }
1815     // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
1816     // checks).
1817     break;
1818 
1819   // Binary scalar-as-vector operations that work column-wise. The high
1820   // elements come from operand 0. The low element is a function of both
1821   // operands.
1822   case Intrinsic::x86_sse_min_ss:
1823   case Intrinsic::x86_sse_max_ss:
1824   case Intrinsic::x86_sse_cmp_ss:
1825   case Intrinsic::x86_sse2_min_sd:
1826   case Intrinsic::x86_sse2_max_sd:
1827   case Intrinsic::x86_sse2_cmp_sd: {
1828     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1829 
1830     // If lowest element of a scalar op isn't used then use Arg0.
1831     if (!DemandedElts[0]) {
1832       IC.addToWorklist(&II);
1833       return II.getArgOperand(0);
1834     }
1835 
1836     // Only lower element is used for operand 1.
1837     DemandedElts = 1;
1838     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1839 
1840     // Lower element is undefined if both lower elements are undefined.
1841     // Consider things like undef&0.  The result is known zero, not undef.
1842     if (!UndefElts2[0])
1843       UndefElts.clearBit(0);
1844 
1845     break;
1846   }
1847 
1848   // Binary scalar-as-vector operations that work column-wise. The high
1849   // elements come from operand 0 and the low element comes from operand 1.
1850   case Intrinsic::x86_sse41_round_ss:
1851   case Intrinsic::x86_sse41_round_sd: {
1852     // Don't use the low element of operand 0.
1853     APInt DemandedElts2 = DemandedElts;
1854     DemandedElts2.clearBit(0);
1855     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1856 
1857     // If lowest element of a scalar op isn't used then use Arg0.
1858     if (!DemandedElts[0]) {
1859       IC.addToWorklist(&II);
1860       return II.getArgOperand(0);
1861     }
1862 
1863     // Only lower element is used for operand 1.
1864     DemandedElts = 1;
1865     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1866 
1867     // Take the high undef elements from operand 0 and take the lower element
1868     // from operand 1.
1869     UndefElts.clearBit(0);
1870     UndefElts |= UndefElts2[0];
1871     break;
1872   }
1873 
1874   // Three input scalar-as-vector operations that work column-wise. The high
1875   // elements come from operand 0 and the low element is a function of all
1876   // three inputs.
1877   case Intrinsic::x86_avx512_mask_add_ss_round:
1878   case Intrinsic::x86_avx512_mask_div_ss_round:
1879   case Intrinsic::x86_avx512_mask_mul_ss_round:
1880   case Intrinsic::x86_avx512_mask_sub_ss_round:
1881   case Intrinsic::x86_avx512_mask_max_ss_round:
1882   case Intrinsic::x86_avx512_mask_min_ss_round:
1883   case Intrinsic::x86_avx512_mask_add_sd_round:
1884   case Intrinsic::x86_avx512_mask_div_sd_round:
1885   case Intrinsic::x86_avx512_mask_mul_sd_round:
1886   case Intrinsic::x86_avx512_mask_sub_sd_round:
1887   case Intrinsic::x86_avx512_mask_max_sd_round:
1888   case Intrinsic::x86_avx512_mask_min_sd_round:
1889     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1890 
1891     // If lowest element of a scalar op isn't used then use Arg0.
1892     if (!DemandedElts[0]) {
1893       IC.addToWorklist(&II);
1894       return II.getArgOperand(0);
1895     }
1896 
1897     // Only lower element is used for operand 1 and 2.
1898     DemandedElts = 1;
1899     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1900     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1901 
1902     // Lower element is undefined if all three lower elements are undefined.
1903     // Consider things like undef&0.  The result is known zero, not undef.
1904     if (!UndefElts2[0] || !UndefElts3[0])
1905       UndefElts.clearBit(0);
1906     break;
1907 
1908   // TODO: Add fmaddsub support?
1909   case Intrinsic::x86_sse3_addsub_pd:
1910   case Intrinsic::x86_sse3_addsub_ps:
1911   case Intrinsic::x86_avx_addsub_pd_256:
1912   case Intrinsic::x86_avx_addsub_ps_256: {
1913     // If none of the even or none of the odd lanes are required, turn this
1914     // into a generic FP math instruction.
1915     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
1916     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
1917     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
1918     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
1919     if (IsSubOnly || IsAddOnly) {
1920       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
1921       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1922       IC.Builder.SetInsertPoint(&II);
1923       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
1924       return IC.Builder.CreateBinOp(
1925           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
1926     }
1927 
1928     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1929     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1930     UndefElts &= UndefElts2;
1931     break;
1932   }
1933 
1934   // General per-element vector operations.
1935   case Intrinsic::x86_avx2_psllv_d:
1936   case Intrinsic::x86_avx2_psllv_d_256:
1937   case Intrinsic::x86_avx2_psllv_q:
1938   case Intrinsic::x86_avx2_psllv_q_256:
1939   case Intrinsic::x86_avx2_psrlv_d:
1940   case Intrinsic::x86_avx2_psrlv_d_256:
1941   case Intrinsic::x86_avx2_psrlv_q:
1942   case Intrinsic::x86_avx2_psrlv_q_256:
1943   case Intrinsic::x86_avx2_psrav_d:
1944   case Intrinsic::x86_avx2_psrav_d_256: {
1945     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1946     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1947     UndefElts &= UndefElts2;
1948     break;
1949   }
1950 
1951   case Intrinsic::x86_sse2_packssdw_128:
1952   case Intrinsic::x86_sse2_packsswb_128:
1953   case Intrinsic::x86_sse2_packuswb_128:
1954   case Intrinsic::x86_sse41_packusdw:
1955   case Intrinsic::x86_avx2_packssdw:
1956   case Intrinsic::x86_avx2_packsswb:
1957   case Intrinsic::x86_avx2_packusdw:
1958   case Intrinsic::x86_avx2_packuswb:
1959   case Intrinsic::x86_avx512_packssdw_512:
1960   case Intrinsic::x86_avx512_packsswb_512:
1961   case Intrinsic::x86_avx512_packusdw_512:
1962   case Intrinsic::x86_avx512_packuswb_512: {
1963     auto *Ty0 = II.getArgOperand(0)->getType();
1964     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1965     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1966 
1967     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
1968     unsigned VWidthPerLane = VWidth / NumLanes;
1969     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
1970 
1971     // Per lane, pack the elements of the first input and then the second.
1972     // e.g.
1973     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1974     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1975     for (int OpNum = 0; OpNum != 2; ++OpNum) {
1976       APInt OpDemandedElts(InnerVWidth, 0);
1977       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1978         unsigned LaneIdx = Lane * VWidthPerLane;
1979         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1980           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
1981           if (DemandedElts[Idx])
1982             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1983         }
1984       }
1985 
1986       // Demand elements from the operand.
1987       APInt OpUndefElts(InnerVWidth, 0);
1988       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1989 
1990       // Pack the operand's UNDEF elements, one lane at a time.
1991       OpUndefElts = OpUndefElts.zext(VWidth);
1992       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1993         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1994         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1995         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1996         UndefElts |= LaneElts;
1997       }
1998     }
1999     break;
2000   }
2001 
2002   // PSHUFB
2003   case Intrinsic::x86_ssse3_pshuf_b_128:
2004   case Intrinsic::x86_avx2_pshuf_b:
2005   case Intrinsic::x86_avx512_pshuf_b_512:
2006   // PERMILVAR
2007   case Intrinsic::x86_avx_vpermilvar_ps:
2008   case Intrinsic::x86_avx_vpermilvar_ps_256:
2009   case Intrinsic::x86_avx512_vpermilvar_ps_512:
2010   case Intrinsic::x86_avx_vpermilvar_pd:
2011   case Intrinsic::x86_avx_vpermilvar_pd_256:
2012   case Intrinsic::x86_avx512_vpermilvar_pd_512:
2013   // PERMV
2014   case Intrinsic::x86_avx2_permd:
2015   case Intrinsic::x86_avx2_permps: {
2016     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
2017     break;
2018   }
2019 
2020   // SSE4A instructions leave the upper 64-bits of the 128-bit result
2021   // in an undefined state.
2022   case Intrinsic::x86_sse4a_extrq:
2023   case Intrinsic::x86_sse4a_extrqi:
2024   case Intrinsic::x86_sse4a_insertq:
2025   case Intrinsic::x86_sse4a_insertqi:
2026     UndefElts.setHighBits(VWidth / 2);
2027     break;
2028   }
2029   return std::nullopt;
2030 }
2031