1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22
23 using namespace llvm;
24 using namespace llvm::PatternMatch;
25
26 #define DEBUG_TYPE "x86tti"
27
28 /// Return a constant boolean vector that has true elements in all positions
29 /// where the input constant data vector has an element with the sign bit set.
getNegativeIsTrueBoolVec(Constant * V,const DataLayout & DL)30 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
31 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
32 V = ConstantExpr::getBitCast(V, IntTy);
33 V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
34 Constant::getNullValue(IntTy), V, DL);
35 assert(V && "Vector must be foldable");
36 return V;
37 }
38
39 /// Convert the x86 XMM integer vector mask to a vector of bools based on
40 /// each element's most significant bit (the sign bit).
getBoolVecFromMask(Value * Mask,const DataLayout & DL)41 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
42 // Fold Constant Mask.
43 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
44 return getNegativeIsTrueBoolVec(ConstantMask, DL);
45
46 // Mask was extended from a boolean vector.
47 Value *ExtMask;
48 if (match(Mask, m_SExt(m_Value(ExtMask))) &&
49 ExtMask->getType()->isIntOrIntVectorTy(1))
50 return ExtMask;
51
52 return nullptr;
53 }
54
55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
56 // XMM register mask efficiently, we could transform all x86 masked intrinsics
57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
simplifyX86MaskedLoad(IntrinsicInst & II,InstCombiner & IC)58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59 Value *Ptr = II.getOperand(0);
60 Value *Mask = II.getOperand(1);
61 Constant *ZeroVec = Constant::getNullValue(II.getType());
62
63 // Zero Mask - masked load instruction creates a zero vector.
64 if (isa<ConstantAggregateZero>(Mask))
65 return IC.replaceInstUsesWith(II, ZeroVec);
66
67 // The mask is constant or extended from a bool vector. Convert this x86
68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
71 // the LLVM intrinsic definition for the pointer argument.
72 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
73 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
74 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
75
76 // The pass-through vector for an x86 masked load is a zero vector.
77 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
78 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
79 return IC.replaceInstUsesWith(II, NewMaskedLoad);
80 }
81
82 return nullptr;
83 }
84
85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
86 // XMM register mask efficiently, we could transform all x86 masked intrinsics
87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
simplifyX86MaskedStore(IntrinsicInst & II,InstCombiner & IC)88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
89 Value *Ptr = II.getOperand(0);
90 Value *Mask = II.getOperand(1);
91 Value *Vec = II.getOperand(2);
92
93 // Zero Mask - this masked store instruction does nothing.
94 if (isa<ConstantAggregateZero>(Mask)) {
95 IC.eraseInstFromFunction(II);
96 return true;
97 }
98
99 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
100 // anything else at this level.
101 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
102 return false;
103
104 // The mask is constant or extended from a bool vector. Convert this x86
105 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
106 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
107 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
108 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
109 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
110
111 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
112
113 // 'Replace uses' doesn't work for stores. Erase the original masked store.
114 IC.eraseInstFromFunction(II);
115 return true;
116 }
117
118 return false;
119 }
120
simplifyX86immShift(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)121 static Value *simplifyX86immShift(const IntrinsicInst &II,
122 InstCombiner::BuilderTy &Builder) {
123 bool LogicalShift = false;
124 bool ShiftLeft = false;
125 bool IsImm = false;
126
127 switch (II.getIntrinsicID()) {
128 default:
129 llvm_unreachable("Unexpected intrinsic!");
130 case Intrinsic::x86_sse2_psrai_d:
131 case Intrinsic::x86_sse2_psrai_w:
132 case Intrinsic::x86_avx2_psrai_d:
133 case Intrinsic::x86_avx2_psrai_w:
134 case Intrinsic::x86_avx512_psrai_q_128:
135 case Intrinsic::x86_avx512_psrai_q_256:
136 case Intrinsic::x86_avx512_psrai_d_512:
137 case Intrinsic::x86_avx512_psrai_q_512:
138 case Intrinsic::x86_avx512_psrai_w_512:
139 IsImm = true;
140 [[fallthrough]];
141 case Intrinsic::x86_sse2_psra_d:
142 case Intrinsic::x86_sse2_psra_w:
143 case Intrinsic::x86_avx2_psra_d:
144 case Intrinsic::x86_avx2_psra_w:
145 case Intrinsic::x86_avx512_psra_q_128:
146 case Intrinsic::x86_avx512_psra_q_256:
147 case Intrinsic::x86_avx512_psra_d_512:
148 case Intrinsic::x86_avx512_psra_q_512:
149 case Intrinsic::x86_avx512_psra_w_512:
150 LogicalShift = false;
151 ShiftLeft = false;
152 break;
153 case Intrinsic::x86_sse2_psrli_d:
154 case Intrinsic::x86_sse2_psrli_q:
155 case Intrinsic::x86_sse2_psrli_w:
156 case Intrinsic::x86_avx2_psrli_d:
157 case Intrinsic::x86_avx2_psrli_q:
158 case Intrinsic::x86_avx2_psrli_w:
159 case Intrinsic::x86_avx512_psrli_d_512:
160 case Intrinsic::x86_avx512_psrli_q_512:
161 case Intrinsic::x86_avx512_psrli_w_512:
162 IsImm = true;
163 [[fallthrough]];
164 case Intrinsic::x86_sse2_psrl_d:
165 case Intrinsic::x86_sse2_psrl_q:
166 case Intrinsic::x86_sse2_psrl_w:
167 case Intrinsic::x86_avx2_psrl_d:
168 case Intrinsic::x86_avx2_psrl_q:
169 case Intrinsic::x86_avx2_psrl_w:
170 case Intrinsic::x86_avx512_psrl_d_512:
171 case Intrinsic::x86_avx512_psrl_q_512:
172 case Intrinsic::x86_avx512_psrl_w_512:
173 LogicalShift = true;
174 ShiftLeft = false;
175 break;
176 case Intrinsic::x86_sse2_pslli_d:
177 case Intrinsic::x86_sse2_pslli_q:
178 case Intrinsic::x86_sse2_pslli_w:
179 case Intrinsic::x86_avx2_pslli_d:
180 case Intrinsic::x86_avx2_pslli_q:
181 case Intrinsic::x86_avx2_pslli_w:
182 case Intrinsic::x86_avx512_pslli_d_512:
183 case Intrinsic::x86_avx512_pslli_q_512:
184 case Intrinsic::x86_avx512_pslli_w_512:
185 IsImm = true;
186 [[fallthrough]];
187 case Intrinsic::x86_sse2_psll_d:
188 case Intrinsic::x86_sse2_psll_q:
189 case Intrinsic::x86_sse2_psll_w:
190 case Intrinsic::x86_avx2_psll_d:
191 case Intrinsic::x86_avx2_psll_q:
192 case Intrinsic::x86_avx2_psll_w:
193 case Intrinsic::x86_avx512_psll_d_512:
194 case Intrinsic::x86_avx512_psll_q_512:
195 case Intrinsic::x86_avx512_psll_w_512:
196 LogicalShift = true;
197 ShiftLeft = true;
198 break;
199 }
200 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
201
202 Value *Vec = II.getArgOperand(0);
203 Value *Amt = II.getArgOperand(1);
204 auto *VT = cast<FixedVectorType>(Vec->getType());
205 Type *SVT = VT->getElementType();
206 Type *AmtVT = Amt->getType();
207 unsigned VWidth = VT->getNumElements();
208 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
209
210 // If the shift amount is guaranteed to be in-range we can replace it with a
211 // generic shift. If its guaranteed to be out of range, logical shifts combine
212 // to zero and arithmetic shifts are clamped to (BitWidth - 1).
213 if (IsImm) {
214 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
215 KnownBits KnownAmtBits =
216 llvm::computeKnownBits(Amt, II.getDataLayout());
217 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
218 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
219 Amt = Builder.CreateVectorSplat(VWidth, Amt);
220 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
221 : Builder.CreateLShr(Vec, Amt))
222 : Builder.CreateAShr(Vec, Amt));
223 }
224 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
225 if (LogicalShift)
226 return ConstantAggregateZero::get(VT);
227 Amt = ConstantInt::get(SVT, BitWidth - 1);
228 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229 }
230 } else {
231 // Ensure the first element has an in-range value and the rest of the
232 // elements in the bottom 64 bits are zero.
233 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
234 cast<VectorType>(AmtVT)->getElementType() == SVT &&
235 "Unexpected shift-by-scalar type");
236 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
237 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
238 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
239 KnownBits KnownLowerBits = llvm::computeKnownBits(
240 Amt, DemandedLower, II.getDataLayout());
241 KnownBits KnownUpperBits = llvm::computeKnownBits(
242 Amt, DemandedUpper, II.getDataLayout());
243 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
244 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
245 SmallVector<int, 16> ZeroSplat(VWidth, 0);
246 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
247 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
248 : Builder.CreateLShr(Vec, Amt))
249 : Builder.CreateAShr(Vec, Amt));
250 }
251 }
252
253 // Simplify if count is constant vector.
254 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
255 if (!CDV)
256 return nullptr;
257
258 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
259 // operand to compute the shift amount.
260 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
261 cast<VectorType>(AmtVT)->getElementType() == SVT &&
262 "Unexpected shift-by-scalar type");
263
264 // Concatenate the sub-elements to create the 64-bit value.
265 APInt Count(64, 0);
266 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
267 unsigned SubEltIdx = (NumSubElts - 1) - i;
268 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
269 Count <<= BitWidth;
270 Count |= SubElt->getValue().zextOrTrunc(64);
271 }
272
273 // If shift-by-zero then just return the original value.
274 if (Count.isZero())
275 return Vec;
276
277 // Handle cases when Shift >= BitWidth.
278 if (Count.uge(BitWidth)) {
279 // If LogicalShift - just return zero.
280 if (LogicalShift)
281 return ConstantAggregateZero::get(VT);
282
283 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
284 Count = APInt(64, BitWidth - 1);
285 }
286
287 // Get a constant vector of the same type as the first operand.
288 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
289 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
290
291 if (ShiftLeft)
292 return Builder.CreateShl(Vec, ShiftVec);
293
294 if (LogicalShift)
295 return Builder.CreateLShr(Vec, ShiftVec);
296
297 return Builder.CreateAShr(Vec, ShiftVec);
298 }
299
300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
simplifyX86varShift(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)303 static Value *simplifyX86varShift(const IntrinsicInst &II,
304 InstCombiner::BuilderTy &Builder) {
305 bool LogicalShift = false;
306 bool ShiftLeft = false;
307
308 switch (II.getIntrinsicID()) {
309 default:
310 llvm_unreachable("Unexpected intrinsic!");
311 case Intrinsic::x86_avx2_psrav_d:
312 case Intrinsic::x86_avx2_psrav_d_256:
313 case Intrinsic::x86_avx512_psrav_q_128:
314 case Intrinsic::x86_avx512_psrav_q_256:
315 case Intrinsic::x86_avx512_psrav_d_512:
316 case Intrinsic::x86_avx512_psrav_q_512:
317 case Intrinsic::x86_avx512_psrav_w_128:
318 case Intrinsic::x86_avx512_psrav_w_256:
319 case Intrinsic::x86_avx512_psrav_w_512:
320 LogicalShift = false;
321 ShiftLeft = false;
322 break;
323 case Intrinsic::x86_avx2_psrlv_d:
324 case Intrinsic::x86_avx2_psrlv_d_256:
325 case Intrinsic::x86_avx2_psrlv_q:
326 case Intrinsic::x86_avx2_psrlv_q_256:
327 case Intrinsic::x86_avx512_psrlv_d_512:
328 case Intrinsic::x86_avx512_psrlv_q_512:
329 case Intrinsic::x86_avx512_psrlv_w_128:
330 case Intrinsic::x86_avx512_psrlv_w_256:
331 case Intrinsic::x86_avx512_psrlv_w_512:
332 LogicalShift = true;
333 ShiftLeft = false;
334 break;
335 case Intrinsic::x86_avx2_psllv_d:
336 case Intrinsic::x86_avx2_psllv_d_256:
337 case Intrinsic::x86_avx2_psllv_q:
338 case Intrinsic::x86_avx2_psllv_q_256:
339 case Intrinsic::x86_avx512_psllv_d_512:
340 case Intrinsic::x86_avx512_psllv_q_512:
341 case Intrinsic::x86_avx512_psllv_w_128:
342 case Intrinsic::x86_avx512_psllv_w_256:
343 case Intrinsic::x86_avx512_psllv_w_512:
344 LogicalShift = true;
345 ShiftLeft = true;
346 break;
347 }
348 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
349
350 Value *Vec = II.getArgOperand(0);
351 Value *Amt = II.getArgOperand(1);
352 auto *VT = cast<FixedVectorType>(II.getType());
353 Type *SVT = VT->getElementType();
354 int NumElts = VT->getNumElements();
355 int BitWidth = SVT->getIntegerBitWidth();
356
357 // If the shift amount is guaranteed to be in-range we can replace it with a
358 // generic shift.
359 KnownBits KnownAmt =
360 llvm::computeKnownBits(Amt, II.getDataLayout());
361 if (KnownAmt.getMaxValue().ult(BitWidth)) {
362 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
363 : Builder.CreateLShr(Vec, Amt))
364 : Builder.CreateAShr(Vec, Amt));
365 }
366
367 // Simplify if all shift amounts are constant/undef.
368 auto *CShift = dyn_cast<Constant>(Amt);
369 if (!CShift)
370 return nullptr;
371
372 // Collect each element's shift amount.
373 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
374 bool AnyOutOfRange = false;
375 SmallVector<int, 8> ShiftAmts;
376 for (int I = 0; I < NumElts; ++I) {
377 auto *CElt = CShift->getAggregateElement(I);
378 if (isa_and_nonnull<UndefValue>(CElt)) {
379 ShiftAmts.push_back(-1);
380 continue;
381 }
382
383 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
384 if (!COp)
385 return nullptr;
386
387 // Handle out of range shifts.
388 // If LogicalShift - set to BitWidth (special case).
389 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
390 APInt ShiftVal = COp->getValue();
391 if (ShiftVal.uge(BitWidth)) {
392 AnyOutOfRange = LogicalShift;
393 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
394 continue;
395 }
396
397 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
398 }
399
400 // If all elements out of range or UNDEF, return vector of zeros/undefs.
401 // ArithmeticShift should only hit this if they are all UNDEF.
402 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
403 if (llvm::all_of(ShiftAmts, OutOfRange)) {
404 SmallVector<Constant *, 8> ConstantVec;
405 for (int Idx : ShiftAmts) {
406 if (Idx < 0) {
407 ConstantVec.push_back(UndefValue::get(SVT));
408 } else {
409 assert(LogicalShift && "Logical shift expected");
410 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
411 }
412 }
413 return ConstantVector::get(ConstantVec);
414 }
415
416 // We can't handle only some out of range values with generic logical shifts.
417 if (AnyOutOfRange)
418 return nullptr;
419
420 // Build the shift amount constant vector.
421 SmallVector<Constant *, 8> ShiftVecAmts;
422 for (int Idx : ShiftAmts) {
423 if (Idx < 0)
424 ShiftVecAmts.push_back(UndefValue::get(SVT));
425 else
426 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427 }
428 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
429
430 if (ShiftLeft)
431 return Builder.CreateShl(Vec, ShiftVec);
432
433 if (LogicalShift)
434 return Builder.CreateLShr(Vec, ShiftVec);
435
436 return Builder.CreateAShr(Vec, ShiftVec);
437 }
438
simplifyX86pack(IntrinsicInst & II,InstCombiner::BuilderTy & Builder,bool IsSigned)439 static Value *simplifyX86pack(IntrinsicInst &II,
440 InstCombiner::BuilderTy &Builder, bool IsSigned) {
441 Value *Arg0 = II.getArgOperand(0);
442 Value *Arg1 = II.getArgOperand(1);
443 Type *ResTy = II.getType();
444
445 // Fast all undef handling.
446 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
447 return UndefValue::get(ResTy);
448
449 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
450 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
451 unsigned NumSrcElts = ArgTy->getNumElements();
452 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
453 "Unexpected packing types");
454
455 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
456 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
457 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
458 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
459 "Unexpected packing types");
460
461 // Constant folding.
462 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
463 return nullptr;
464
465 // Clamp Values - signed/unsigned both use signed clamp values, but they
466 // differ on the min/max values.
467 APInt MinValue, MaxValue;
468 if (IsSigned) {
469 // PACKSS: Truncate signed value with signed saturation.
470 // Source values less than dst minint are saturated to minint.
471 // Source values greater than dst maxint are saturated to maxint.
472 MinValue =
473 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474 MaxValue =
475 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
476 } else {
477 // PACKUS: Truncate signed value with unsigned saturation.
478 // Source values less than zero are saturated to zero.
479 // Source values greater than dst maxuint are saturated to maxuint.
480 MinValue = APInt::getZero(SrcScalarSizeInBits);
481 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
482 }
483
484 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
485 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
488 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
489 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
490
491 // Shuffle clamped args together at the lane level.
492 SmallVector<int, 32> PackMask;
493 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
496 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
497 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498 }
499 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
500
501 // Truncate to dst size.
502 return Builder.CreateTrunc(Shuffle, ResTy);
503 }
504
simplifyX86pmulh(IntrinsicInst & II,InstCombiner::BuilderTy & Builder,bool IsSigned,bool IsRounding)505 static Value *simplifyX86pmulh(IntrinsicInst &II,
506 InstCombiner::BuilderTy &Builder, bool IsSigned,
507 bool IsRounding) {
508 Value *Arg0 = II.getArgOperand(0);
509 Value *Arg1 = II.getArgOperand(1);
510 auto *ResTy = cast<FixedVectorType>(II.getType());
511 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
512 assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
513 "Unexpected PMULH types");
514 assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");
515
516 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
517 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
518 return ConstantAggregateZero::get(ResTy);
519
520 // Multiply by zero.
521 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
522 return ConstantAggregateZero::get(ResTy);
523
524 // Multiply by one.
525 if (!IsRounding) {
526 if (match(Arg0, m_One()))
527 return IsSigned ? Builder.CreateAShr(Arg1, 15)
528 : ConstantAggregateZero::get(ResTy);
529 if (match(Arg1, m_One()))
530 return IsSigned ? Builder.CreateAShr(Arg0, 15)
531 : ConstantAggregateZero::get(ResTy);
532 }
533
534 // Constant folding.
535 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
536 return nullptr;
537
538 // Extend to twice the width and multiply.
539 auto Cast =
540 IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
541 auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
542 Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
543 Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
544 Value *Mul = Builder.CreateMul(LHS, RHS);
545
546 if (IsRounding) {
547 // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
548 // extract bits[16:1].
549 auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
550 auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
551 Mul = Builder.CreateLShr(Mul, 14);
552 Mul = Builder.CreateTrunc(Mul, RndTy);
553 Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
554 Mul = Builder.CreateLShr(Mul, 1);
555 } else {
556 // PMULH/PMULHU: extract the vXi16 most significant bits.
557 Mul = Builder.CreateLShr(Mul, 16);
558 }
559
560 return Builder.CreateTrunc(Mul, ResTy);
561 }
562
simplifyX86pmadd(IntrinsicInst & II,InstCombiner::BuilderTy & Builder,bool IsPMADDWD)563 static Value *simplifyX86pmadd(IntrinsicInst &II,
564 InstCombiner::BuilderTy &Builder,
565 bool IsPMADDWD) {
566 Value *Arg0 = II.getArgOperand(0);
567 Value *Arg1 = II.getArgOperand(1);
568 auto *ResTy = cast<FixedVectorType>(II.getType());
569 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
570
571 unsigned NumDstElts = ResTy->getNumElements();
572 assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
573 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
574 "Unexpected PMADD types");
575
576 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
577 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
578 return ConstantAggregateZero::get(ResTy);
579
580 // Multiply by zero.
581 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
582 return ConstantAggregateZero::get(ResTy);
583
584 // Constant folding.
585 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
586 return nullptr;
587
588 // Split Lo/Hi elements pairs, extend and add together.
589 // PMADDWD(X,Y) =
590 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
591 // PMADDUBSW(X,Y) =
592 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
593 SmallVector<int> LoMask, HiMask;
594 for (unsigned I = 0; I != NumDstElts; ++I) {
595 LoMask.push_back(2 * I + 0);
596 HiMask.push_back(2 * I + 1);
597 }
598
599 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
600 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
601 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
602 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
603
604 auto LHSCast =
605 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
606 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
607 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
608 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
609 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
610 Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
611 Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
612 return IsPMADDWD
613 ? Builder.CreateAdd(Lo, Hi)
614 : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
615 }
616
simplifyX86movmsk(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)617 static Value *simplifyX86movmsk(const IntrinsicInst &II,
618 InstCombiner::BuilderTy &Builder) {
619 Value *Arg = II.getArgOperand(0);
620 Type *ResTy = II.getType();
621
622 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
623 if (isa<UndefValue>(Arg))
624 return Constant::getNullValue(ResTy);
625
626 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
627 // We can't easily peek through x86_mmx types.
628 if (!ArgTy)
629 return nullptr;
630
631 // Expand MOVMSK to compare/bitcast/zext:
632 // e.g. PMOVMSKB(v16i8 x):
633 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
634 // %int = bitcast <16 x i1> %cmp to i16
635 // %res = zext i16 %int to i32
636 unsigned NumElts = ArgTy->getNumElements();
637 Type *IntegerTy = Builder.getIntNTy(NumElts);
638
639 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
640 Res = Builder.CreateIsNeg(Res);
641 Res = Builder.CreateBitCast(Res, IntegerTy);
642 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
643 return Res;
644 }
645
simplifyX86addcarry(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)646 static Value *simplifyX86addcarry(const IntrinsicInst &II,
647 InstCombiner::BuilderTy &Builder) {
648 Value *CarryIn = II.getArgOperand(0);
649 Value *Op1 = II.getArgOperand(1);
650 Value *Op2 = II.getArgOperand(2);
651 Type *RetTy = II.getType();
652 Type *OpTy = Op1->getType();
653 assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
654 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
655 "Unexpected types for x86 addcarry");
656
657 // If carry-in is zero, this is just an unsigned add with overflow.
658 if (match(CarryIn, m_ZeroInt())) {
659 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
660 {Op1, Op2});
661 // The types have to be adjusted to match the x86 call types.
662 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
663 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
664 Builder.getInt8Ty());
665 Value *Res = PoisonValue::get(RetTy);
666 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
667 return Builder.CreateInsertValue(Res, UAddResult, 1);
668 }
669
670 return nullptr;
671 }
672
simplifyTernarylogic(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)673 static Value *simplifyTernarylogic(const IntrinsicInst &II,
674 InstCombiner::BuilderTy &Builder) {
675
676 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
677 if (!ArgImm || ArgImm->getValue().uge(256))
678 return nullptr;
679
680 Value *ArgA = II.getArgOperand(0);
681 Value *ArgB = II.getArgOperand(1);
682 Value *ArgC = II.getArgOperand(2);
683
684 Type *Ty = II.getType();
685
686 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
687 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
688 };
689 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
690 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
691 };
692 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
693 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
694 };
695 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
696 return {Builder.CreateNot(V.first), ~V.second};
697 };
698 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
699 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
700 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
701
702 bool AIsConst = match(ArgA, m_ImmConstant());
703 bool BIsConst = match(ArgB, m_ImmConstant());
704 bool CIsConst = match(ArgC, m_ImmConstant());
705
706 bool ABIsConst = AIsConst && BIsConst;
707 bool ACIsConst = AIsConst && CIsConst;
708 bool BCIsConst = BIsConst && CIsConst;
709 bool ABCIsConst = AIsConst && BIsConst && CIsConst;
710
711 // Use for verification. Its a big table. Its difficult to go from Imm ->
712 // logic ops, but easy to verify that a set of logic ops is correct. We track
713 // the logic ops through the second value in the pair. At the end it should
714 // equal Imm.
715 std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
716 std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
717 std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
718 std::pair<Value *, uint8_t> Res = {nullptr, 0};
719
720 // Currently we only handle cases that convert directly to another instruction
721 // or cases where all the ops are constant. This is because we don't properly
722 // handle creating ternary ops in the backend, so splitting them here may
723 // cause regressions. As the backend improves, uncomment more cases.
724
725 uint8_t Imm = ArgImm->getValue().getZExtValue();
726 switch (Imm) {
727 case 0x0:
728 Res = {Constant::getNullValue(Ty), 0};
729 break;
730 case 0x1:
731 if (ABCIsConst)
732 Res = Nor(Or(A, B), C);
733 break;
734 case 0x2:
735 if (ABCIsConst)
736 Res = And(Nor(A, B), C);
737 break;
738 case 0x3:
739 if (ABIsConst)
740 Res = Nor(A, B);
741 break;
742 case 0x4:
743 if (ABCIsConst)
744 Res = And(Nor(A, C), B);
745 break;
746 case 0x5:
747 if (ACIsConst)
748 Res = Nor(A, C);
749 break;
750 case 0x6:
751 if (ABCIsConst)
752 Res = Nor(A, Xnor(B, C));
753 break;
754 case 0x7:
755 if (ABCIsConst)
756 Res = Nor(A, And(B, C));
757 break;
758 case 0x8:
759 if (ABCIsConst)
760 Res = Nor(A, Nand(B, C));
761 break;
762 case 0x9:
763 if (ABCIsConst)
764 Res = Nor(A, Xor(B, C));
765 break;
766 case 0xa:
767 if (ACIsConst)
768 Res = Nor(A, Not(C));
769 break;
770 case 0xb:
771 if (ABCIsConst)
772 Res = Nor(A, Nor(C, Not(B)));
773 break;
774 case 0xc:
775 if (ABIsConst)
776 Res = Nor(A, Not(B));
777 break;
778 case 0xd:
779 if (ABCIsConst)
780 Res = Nor(A, Nor(B, Not(C)));
781 break;
782 case 0xe:
783 if (ABCIsConst)
784 Res = Nor(A, Nor(B, C));
785 break;
786 case 0xf:
787 Res = Not(A);
788 break;
789 case 0x10:
790 if (ABCIsConst)
791 Res = And(A, Nor(B, C));
792 break;
793 case 0x11:
794 if (BCIsConst)
795 Res = Nor(B, C);
796 break;
797 case 0x12:
798 if (ABCIsConst)
799 Res = Nor(Xnor(A, C), B);
800 break;
801 case 0x13:
802 if (ABCIsConst)
803 Res = Nor(And(A, C), B);
804 break;
805 case 0x14:
806 if (ABCIsConst)
807 Res = Nor(Xnor(A, B), C);
808 break;
809 case 0x15:
810 if (ABCIsConst)
811 Res = Nor(And(A, B), C);
812 break;
813 case 0x16:
814 if (ABCIsConst)
815 Res = Xor(Xor(A, B), And(Nand(A, B), C));
816 break;
817 case 0x17:
818 if (ABCIsConst)
819 Res = Xor(Or(A, B), Or(Xnor(A, B), C));
820 break;
821 case 0x18:
822 if (ABCIsConst)
823 Res = Nor(Xnor(A, B), Xnor(A, C));
824 break;
825 case 0x19:
826 if (ABCIsConst)
827 Res = And(Nand(A, B), Xnor(B, C));
828 break;
829 case 0x1a:
830 if (ABCIsConst)
831 Res = Xor(A, Or(And(A, B), C));
832 break;
833 case 0x1b:
834 if (ABCIsConst)
835 Res = Xor(A, Or(Xnor(A, B), C));
836 break;
837 case 0x1c:
838 if (ABCIsConst)
839 Res = Xor(A, Or(And(A, C), B));
840 break;
841 case 0x1d:
842 if (ABCIsConst)
843 Res = Xor(A, Or(Xnor(A, C), B));
844 break;
845 case 0x1e:
846 if (ABCIsConst)
847 Res = Xor(A, Or(B, C));
848 break;
849 case 0x1f:
850 if (ABCIsConst)
851 Res = Nand(A, Or(B, C));
852 break;
853 case 0x20:
854 if (ABCIsConst)
855 Res = Nor(Nand(A, C), B);
856 break;
857 case 0x21:
858 if (ABCIsConst)
859 Res = Nor(Xor(A, C), B);
860 break;
861 case 0x22:
862 if (BCIsConst)
863 Res = Nor(B, Not(C));
864 break;
865 case 0x23:
866 if (ABCIsConst)
867 Res = Nor(B, Nor(C, Not(A)));
868 break;
869 case 0x24:
870 if (ABCIsConst)
871 Res = Nor(Xnor(A, B), Xor(A, C));
872 break;
873 case 0x25:
874 if (ABCIsConst)
875 Res = Xor(A, Nand(Nand(A, B), C));
876 break;
877 case 0x26:
878 if (ABCIsConst)
879 Res = And(Nand(A, B), Xor(B, C));
880 break;
881 case 0x27:
882 if (ABCIsConst)
883 Res = Xor(Or(Xnor(A, B), C), B);
884 break;
885 case 0x28:
886 if (ABCIsConst)
887 Res = And(Xor(A, B), C);
888 break;
889 case 0x29:
890 if (ABCIsConst)
891 Res = Xor(Xor(A, B), Nor(And(A, B), C));
892 break;
893 case 0x2a:
894 if (ABCIsConst)
895 Res = And(Nand(A, B), C);
896 break;
897 case 0x2b:
898 if (ABCIsConst)
899 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
900 break;
901 case 0x2c:
902 if (ABCIsConst)
903 Res = Nor(Xnor(A, B), Nor(B, C));
904 break;
905 case 0x2d:
906 if (ABCIsConst)
907 Res = Xor(A, Or(B, Not(C)));
908 break;
909 case 0x2e:
910 if (ABCIsConst)
911 Res = Xor(A, Or(Xor(A, C), B));
912 break;
913 case 0x2f:
914 if (ABCIsConst)
915 Res = Nand(A, Or(B, Not(C)));
916 break;
917 case 0x30:
918 if (ABIsConst)
919 Res = Nor(B, Not(A));
920 break;
921 case 0x31:
922 if (ABCIsConst)
923 Res = Nor(Nor(A, Not(C)), B);
924 break;
925 case 0x32:
926 if (ABCIsConst)
927 Res = Nor(Nor(A, C), B);
928 break;
929 case 0x33:
930 Res = Not(B);
931 break;
932 case 0x34:
933 if (ABCIsConst)
934 Res = And(Xor(A, B), Nand(B, C));
935 break;
936 case 0x35:
937 if (ABCIsConst)
938 Res = Xor(B, Or(A, Xnor(B, C)));
939 break;
940 case 0x36:
941 if (ABCIsConst)
942 Res = Xor(Or(A, C), B);
943 break;
944 case 0x37:
945 if (ABCIsConst)
946 Res = Nand(Or(A, C), B);
947 break;
948 case 0x38:
949 if (ABCIsConst)
950 Res = Nor(Xnor(A, B), Nor(A, C));
951 break;
952 case 0x39:
953 if (ABCIsConst)
954 Res = Xor(Or(A, Not(C)), B);
955 break;
956 case 0x3a:
957 if (ABCIsConst)
958 Res = Xor(B, Or(A, Xor(B, C)));
959 break;
960 case 0x3b:
961 if (ABCIsConst)
962 Res = Nand(Or(A, Not(C)), B);
963 break;
964 case 0x3c:
965 Res = Xor(A, B);
966 break;
967 case 0x3d:
968 if (ABCIsConst)
969 Res = Xor(A, Or(Nor(A, C), B));
970 break;
971 case 0x3e:
972 if (ABCIsConst)
973 Res = Xor(A, Or(Nor(A, Not(C)), B));
974 break;
975 case 0x3f:
976 if (ABIsConst)
977 Res = Nand(A, B);
978 break;
979 case 0x40:
980 if (ABCIsConst)
981 Res = Nor(Nand(A, B), C);
982 break;
983 case 0x41:
984 if (ABCIsConst)
985 Res = Nor(Xor(A, B), C);
986 break;
987 case 0x42:
988 if (ABCIsConst)
989 Res = Nor(Xor(A, B), Xnor(A, C));
990 break;
991 case 0x43:
992 if (ABCIsConst)
993 Res = Xor(A, Nand(Nand(A, C), B));
994 break;
995 case 0x44:
996 if (BCIsConst)
997 Res = Nor(C, Not(B));
998 break;
999 case 0x45:
1000 if (ABCIsConst)
1001 Res = Nor(Nor(B, Not(A)), C);
1002 break;
1003 case 0x46:
1004 if (ABCIsConst)
1005 Res = Xor(Or(And(A, C), B), C);
1006 break;
1007 case 0x47:
1008 if (ABCIsConst)
1009 Res = Xor(Or(Xnor(A, C), B), C);
1010 break;
1011 case 0x48:
1012 if (ABCIsConst)
1013 Res = And(Xor(A, C), B);
1014 break;
1015 case 0x49:
1016 if (ABCIsConst)
1017 Res = Xor(Or(Xnor(A, B), And(A, C)), C);
1018 break;
1019 case 0x4a:
1020 if (ABCIsConst)
1021 Res = Nor(Xnor(A, C), Nor(B, C));
1022 break;
1023 case 0x4b:
1024 if (ABCIsConst)
1025 Res = Xor(A, Or(C, Not(B)));
1026 break;
1027 case 0x4c:
1028 if (ABCIsConst)
1029 Res = And(Nand(A, C), B);
1030 break;
1031 case 0x4d:
1032 if (ABCIsConst)
1033 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
1034 break;
1035 case 0x4e:
1036 if (ABCIsConst)
1037 Res = Xor(A, Or(Xor(A, B), C));
1038 break;
1039 case 0x4f:
1040 if (ABCIsConst)
1041 Res = Nand(A, Nand(B, Not(C)));
1042 break;
1043 case 0x50:
1044 if (ACIsConst)
1045 Res = Nor(C, Not(A));
1046 break;
1047 case 0x51:
1048 if (ABCIsConst)
1049 Res = Nor(Nor(A, Not(B)), C);
1050 break;
1051 case 0x52:
1052 if (ABCIsConst)
1053 Res = And(Xor(A, C), Nand(B, C));
1054 break;
1055 case 0x53:
1056 if (ABCIsConst)
1057 Res = Xor(Or(Xnor(B, C), A), C);
1058 break;
1059 case 0x54:
1060 if (ABCIsConst)
1061 Res = Nor(Nor(A, B), C);
1062 break;
1063 case 0x55:
1064 Res = Not(C);
1065 break;
1066 case 0x56:
1067 if (ABCIsConst)
1068 Res = Xor(Or(A, B), C);
1069 break;
1070 case 0x57:
1071 if (ABCIsConst)
1072 Res = Nand(Or(A, B), C);
1073 break;
1074 case 0x58:
1075 if (ABCIsConst)
1076 Res = Nor(Nor(A, B), Xnor(A, C));
1077 break;
1078 case 0x59:
1079 if (ABCIsConst)
1080 Res = Xor(Or(A, Not(B)), C);
1081 break;
1082 case 0x5a:
1083 Res = Xor(A, C);
1084 break;
1085 case 0x5b:
1086 if (ABCIsConst)
1087 Res = Xor(A, Or(Nor(A, B), C));
1088 break;
1089 case 0x5c:
1090 if (ABCIsConst)
1091 Res = Xor(Or(Xor(B, C), A), C);
1092 break;
1093 case 0x5d:
1094 if (ABCIsConst)
1095 Res = Nand(Or(A, Not(B)), C);
1096 break;
1097 case 0x5e:
1098 if (ABCIsConst)
1099 Res = Xor(A, Or(Nor(A, Not(B)), C));
1100 break;
1101 case 0x5f:
1102 if (ACIsConst)
1103 Res = Nand(A, C);
1104 break;
1105 case 0x60:
1106 if (ABCIsConst)
1107 Res = And(A, Xor(B, C));
1108 break;
1109 case 0x61:
1110 if (ABCIsConst)
1111 Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1112 break;
1113 case 0x62:
1114 if (ABCIsConst)
1115 Res = Nor(Nor(A, C), Xnor(B, C));
1116 break;
1117 case 0x63:
1118 if (ABCIsConst)
1119 Res = Xor(B, Or(C, Not(A)));
1120 break;
1121 case 0x64:
1122 if (ABCIsConst)
1123 Res = Nor(Nor(A, B), Xnor(B, C));
1124 break;
1125 case 0x65:
1126 if (ABCIsConst)
1127 Res = Xor(Or(B, Not(A)), C);
1128 break;
1129 case 0x66:
1130 Res = Xor(B, C);
1131 break;
1132 case 0x67:
1133 if (ABCIsConst)
1134 Res = Or(Nor(A, B), Xor(B, C));
1135 break;
1136 case 0x68:
1137 if (ABCIsConst)
1138 Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1139 break;
1140 case 0x69:
1141 if (ABCIsConst)
1142 Res = Xor(Xnor(A, B), C);
1143 break;
1144 case 0x6a:
1145 if (ABCIsConst)
1146 Res = Xor(And(A, B), C);
1147 break;
1148 case 0x6b:
1149 if (ABCIsConst)
1150 Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1151 break;
1152 case 0x6c:
1153 if (ABCIsConst)
1154 Res = Xor(And(A, C), B);
1155 break;
1156 case 0x6d:
1157 if (ABCIsConst)
1158 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1159 break;
1160 case 0x6e:
1161 if (ABCIsConst)
1162 Res = Or(Nor(A, Not(B)), Xor(B, C));
1163 break;
1164 case 0x6f:
1165 if (ABCIsConst)
1166 Res = Nand(A, Xnor(B, C));
1167 break;
1168 case 0x70:
1169 if (ABCIsConst)
1170 Res = And(A, Nand(B, C));
1171 break;
1172 case 0x71:
1173 if (ABCIsConst)
1174 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1175 break;
1176 case 0x72:
1177 if (ABCIsConst)
1178 Res = Xor(Or(Xor(A, B), C), B);
1179 break;
1180 case 0x73:
1181 if (ABCIsConst)
1182 Res = Nand(Nand(A, Not(C)), B);
1183 break;
1184 case 0x74:
1185 if (ABCIsConst)
1186 Res = Xor(Or(Xor(A, C), B), C);
1187 break;
1188 case 0x75:
1189 if (ABCIsConst)
1190 Res = Nand(Nand(A, Not(B)), C);
1191 break;
1192 case 0x76:
1193 if (ABCIsConst)
1194 Res = Xor(B, Or(Nor(B, Not(A)), C));
1195 break;
1196 case 0x77:
1197 if (BCIsConst)
1198 Res = Nand(B, C);
1199 break;
1200 case 0x78:
1201 if (ABCIsConst)
1202 Res = Xor(A, And(B, C));
1203 break;
1204 case 0x79:
1205 if (ABCIsConst)
1206 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1207 break;
1208 case 0x7a:
1209 if (ABCIsConst)
1210 Res = Or(Xor(A, C), Nor(B, Not(A)));
1211 break;
1212 case 0x7b:
1213 if (ABCIsConst)
1214 Res = Nand(Xnor(A, C), B);
1215 break;
1216 case 0x7c:
1217 if (ABCIsConst)
1218 Res = Or(Xor(A, B), Nor(C, Not(A)));
1219 break;
1220 case 0x7d:
1221 if (ABCIsConst)
1222 Res = Nand(Xnor(A, B), C);
1223 break;
1224 case 0x7e:
1225 if (ABCIsConst)
1226 Res = Or(Xor(A, B), Xor(A, C));
1227 break;
1228 case 0x7f:
1229 if (ABCIsConst)
1230 Res = Nand(And(A, B), C);
1231 break;
1232 case 0x80:
1233 if (ABCIsConst)
1234 Res = And(And(A, B), C);
1235 break;
1236 case 0x81:
1237 if (ABCIsConst)
1238 Res = Nor(Xor(A, B), Xor(A, C));
1239 break;
1240 case 0x82:
1241 if (ABCIsConst)
1242 Res = And(Xnor(A, B), C);
1243 break;
1244 case 0x83:
1245 if (ABCIsConst)
1246 Res = Nor(Xor(A, B), Nor(C, Not(A)));
1247 break;
1248 case 0x84:
1249 if (ABCIsConst)
1250 Res = And(Xnor(A, C), B);
1251 break;
1252 case 0x85:
1253 if (ABCIsConst)
1254 Res = Nor(Xor(A, C), Nor(B, Not(A)));
1255 break;
1256 case 0x86:
1257 if (ABCIsConst)
1258 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1259 break;
1260 case 0x87:
1261 if (ABCIsConst)
1262 Res = Xor(A, Nand(B, C));
1263 break;
1264 case 0x88:
1265 Res = And(B, C);
1266 break;
1267 case 0x89:
1268 if (ABCIsConst)
1269 Res = Xor(B, Nor(Nor(B, Not(A)), C));
1270 break;
1271 case 0x8a:
1272 if (ABCIsConst)
1273 Res = And(Nand(A, Not(B)), C);
1274 break;
1275 case 0x8b:
1276 if (ABCIsConst)
1277 Res = Xor(Nor(Xor(A, C), B), C);
1278 break;
1279 case 0x8c:
1280 if (ABCIsConst)
1281 Res = And(Nand(A, Not(C)), B);
1282 break;
1283 case 0x8d:
1284 if (ABCIsConst)
1285 Res = Xor(Nor(Xor(A, B), C), B);
1286 break;
1287 case 0x8e:
1288 if (ABCIsConst)
1289 Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1290 break;
1291 case 0x8f:
1292 if (ABCIsConst)
1293 Res = Nand(A, Nand(B, C));
1294 break;
1295 case 0x90:
1296 if (ABCIsConst)
1297 Res = And(A, Xnor(B, C));
1298 break;
1299 case 0x91:
1300 if (ABCIsConst)
1301 Res = Nor(Nor(A, Not(B)), Xor(B, C));
1302 break;
1303 case 0x92:
1304 if (ABCIsConst)
1305 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1306 break;
1307 case 0x93:
1308 if (ABCIsConst)
1309 Res = Xor(Nand(A, C), B);
1310 break;
1311 case 0x94:
1312 if (ABCIsConst)
1313 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1314 break;
1315 case 0x95:
1316 if (ABCIsConst)
1317 Res = Xor(Nand(A, B), C);
1318 break;
1319 case 0x96:
1320 if (ABCIsConst)
1321 Res = Xor(Xor(A, B), C);
1322 break;
1323 case 0x97:
1324 if (ABCIsConst)
1325 Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1326 break;
1327 case 0x98:
1328 if (ABCIsConst)
1329 Res = Nor(Nor(A, B), Xor(B, C));
1330 break;
1331 case 0x99:
1332 if (BCIsConst)
1333 Res = Xnor(B, C);
1334 break;
1335 case 0x9a:
1336 if (ABCIsConst)
1337 Res = Xor(Nor(B, Not(A)), C);
1338 break;
1339 case 0x9b:
1340 if (ABCIsConst)
1341 Res = Or(Nor(A, B), Xnor(B, C));
1342 break;
1343 case 0x9c:
1344 if (ABCIsConst)
1345 Res = Xor(B, Nor(C, Not(A)));
1346 break;
1347 case 0x9d:
1348 if (ABCIsConst)
1349 Res = Or(Nor(A, C), Xnor(B, C));
1350 break;
1351 case 0x9e:
1352 if (ABCIsConst)
1353 Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1354 break;
1355 case 0x9f:
1356 if (ABCIsConst)
1357 Res = Nand(A, Xor(B, C));
1358 break;
1359 case 0xa0:
1360 Res = And(A, C);
1361 break;
1362 case 0xa1:
1363 if (ABCIsConst)
1364 Res = Xor(A, Nor(Nor(A, Not(B)), C));
1365 break;
1366 case 0xa2:
1367 if (ABCIsConst)
1368 Res = And(Or(A, Not(B)), C);
1369 break;
1370 case 0xa3:
1371 if (ABCIsConst)
1372 Res = Xor(Nor(Xor(B, C), A), C);
1373 break;
1374 case 0xa4:
1375 if (ABCIsConst)
1376 Res = Xor(A, Nor(Nor(A, B), C));
1377 break;
1378 case 0xa5:
1379 if (ACIsConst)
1380 Res = Xnor(A, C);
1381 break;
1382 case 0xa6:
1383 if (ABCIsConst)
1384 Res = Xor(Nor(A, Not(B)), C);
1385 break;
1386 case 0xa7:
1387 if (ABCIsConst)
1388 Res = Or(Nor(A, B), Xnor(A, C));
1389 break;
1390 case 0xa8:
1391 if (ABCIsConst)
1392 Res = And(Or(A, B), C);
1393 break;
1394 case 0xa9:
1395 if (ABCIsConst)
1396 Res = Xor(Nor(A, B), C);
1397 break;
1398 case 0xaa:
1399 Res = C;
1400 break;
1401 case 0xab:
1402 if (ABCIsConst)
1403 Res = Or(Nor(A, B), C);
1404 break;
1405 case 0xac:
1406 if (ABCIsConst)
1407 Res = Xor(Nor(Xnor(B, C), A), C);
1408 break;
1409 case 0xad:
1410 if (ABCIsConst)
1411 Res = Or(Xnor(A, C), And(B, C));
1412 break;
1413 case 0xae:
1414 if (ABCIsConst)
1415 Res = Or(Nor(A, Not(B)), C);
1416 break;
1417 case 0xaf:
1418 if (ACIsConst)
1419 Res = Or(C, Not(A));
1420 break;
1421 case 0xb0:
1422 if (ABCIsConst)
1423 Res = And(A, Nand(B, Not(C)));
1424 break;
1425 case 0xb1:
1426 if (ABCIsConst)
1427 Res = Xor(A, Nor(Xor(A, B), C));
1428 break;
1429 case 0xb2:
1430 if (ABCIsConst)
1431 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1432 break;
1433 case 0xb3:
1434 if (ABCIsConst)
1435 Res = Nand(Nand(A, C), B);
1436 break;
1437 case 0xb4:
1438 if (ABCIsConst)
1439 Res = Xor(A, Nor(C, Not(B)));
1440 break;
1441 case 0xb5:
1442 if (ABCIsConst)
1443 Res = Or(Xnor(A, C), Nor(B, C));
1444 break;
1445 case 0xb6:
1446 if (ABCIsConst)
1447 Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1448 break;
1449 case 0xb7:
1450 if (ABCIsConst)
1451 Res = Nand(Xor(A, C), B);
1452 break;
1453 case 0xb8:
1454 if (ABCIsConst)
1455 Res = Xor(Nor(Xnor(A, C), B), C);
1456 break;
1457 case 0xb9:
1458 if (ABCIsConst)
1459 Res = Xor(Nor(And(A, C), B), C);
1460 break;
1461 case 0xba:
1462 if (ABCIsConst)
1463 Res = Or(Nor(B, Not(A)), C);
1464 break;
1465 case 0xbb:
1466 if (BCIsConst)
1467 Res = Or(C, Not(B));
1468 break;
1469 case 0xbc:
1470 if (ABCIsConst)
1471 Res = Xor(A, And(Nand(A, C), B));
1472 break;
1473 case 0xbd:
1474 if (ABCIsConst)
1475 Res = Or(Xor(A, B), Xnor(A, C));
1476 break;
1477 case 0xbe:
1478 if (ABCIsConst)
1479 Res = Or(Xor(A, B), C);
1480 break;
1481 case 0xbf:
1482 if (ABCIsConst)
1483 Res = Or(Nand(A, B), C);
1484 break;
1485 case 0xc0:
1486 Res = And(A, B);
1487 break;
1488 case 0xc1:
1489 if (ABCIsConst)
1490 Res = Xor(A, Nor(Nor(A, Not(C)), B));
1491 break;
1492 case 0xc2:
1493 if (ABCIsConst)
1494 Res = Xor(A, Nor(Nor(A, C), B));
1495 break;
1496 case 0xc3:
1497 if (ABIsConst)
1498 Res = Xnor(A, B);
1499 break;
1500 case 0xc4:
1501 if (ABCIsConst)
1502 Res = And(Or(A, Not(C)), B);
1503 break;
1504 case 0xc5:
1505 if (ABCIsConst)
1506 Res = Xor(B, Nor(A, Xor(B, C)));
1507 break;
1508 case 0xc6:
1509 if (ABCIsConst)
1510 Res = Xor(Nor(A, Not(C)), B);
1511 break;
1512 case 0xc7:
1513 if (ABCIsConst)
1514 Res = Or(Xnor(A, B), Nor(A, C));
1515 break;
1516 case 0xc8:
1517 if (ABCIsConst)
1518 Res = And(Or(A, C), B);
1519 break;
1520 case 0xc9:
1521 if (ABCIsConst)
1522 Res = Xor(Nor(A, C), B);
1523 break;
1524 case 0xca:
1525 if (ABCIsConst)
1526 Res = Xor(B, Nor(A, Xnor(B, C)));
1527 break;
1528 case 0xcb:
1529 if (ABCIsConst)
1530 Res = Or(Xnor(A, B), And(B, C));
1531 break;
1532 case 0xcc:
1533 Res = B;
1534 break;
1535 case 0xcd:
1536 if (ABCIsConst)
1537 Res = Or(Nor(A, C), B);
1538 break;
1539 case 0xce:
1540 if (ABCIsConst)
1541 Res = Or(Nor(A, Not(C)), B);
1542 break;
1543 case 0xcf:
1544 if (ABIsConst)
1545 Res = Or(B, Not(A));
1546 break;
1547 case 0xd0:
1548 if (ABCIsConst)
1549 Res = And(A, Or(B, Not(C)));
1550 break;
1551 case 0xd1:
1552 if (ABCIsConst)
1553 Res = Xor(A, Nor(Xor(A, C), B));
1554 break;
1555 case 0xd2:
1556 if (ABCIsConst)
1557 Res = Xor(A, Nor(B, Not(C)));
1558 break;
1559 case 0xd3:
1560 if (ABCIsConst)
1561 Res = Or(Xnor(A, B), Nor(B, C));
1562 break;
1563 case 0xd4:
1564 if (ABCIsConst)
1565 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1566 break;
1567 case 0xd5:
1568 if (ABCIsConst)
1569 Res = Nand(Nand(A, B), C);
1570 break;
1571 case 0xd6:
1572 if (ABCIsConst)
1573 Res = Xor(Xor(A, B), Or(And(A, B), C));
1574 break;
1575 case 0xd7:
1576 if (ABCIsConst)
1577 Res = Nand(Xor(A, B), C);
1578 break;
1579 case 0xd8:
1580 if (ABCIsConst)
1581 Res = Xor(Nor(Xnor(A, B), C), B);
1582 break;
1583 case 0xd9:
1584 if (ABCIsConst)
1585 Res = Or(And(A, B), Xnor(B, C));
1586 break;
1587 case 0xda:
1588 if (ABCIsConst)
1589 Res = Xor(A, And(Nand(A, B), C));
1590 break;
1591 case 0xdb:
1592 if (ABCIsConst)
1593 Res = Or(Xnor(A, B), Xor(A, C));
1594 break;
1595 case 0xdc:
1596 if (ABCIsConst)
1597 Res = Or(B, Nor(C, Not(A)));
1598 break;
1599 case 0xdd:
1600 if (BCIsConst)
1601 Res = Or(B, Not(C));
1602 break;
1603 case 0xde:
1604 if (ABCIsConst)
1605 Res = Or(Xor(A, C), B);
1606 break;
1607 case 0xdf:
1608 if (ABCIsConst)
1609 Res = Or(Nand(A, C), B);
1610 break;
1611 case 0xe0:
1612 if (ABCIsConst)
1613 Res = And(A, Or(B, C));
1614 break;
1615 case 0xe1:
1616 if (ABCIsConst)
1617 Res = Xor(A, Nor(B, C));
1618 break;
1619 case 0xe2:
1620 if (ABCIsConst)
1621 Res = Xor(A, Nor(Xnor(A, C), B));
1622 break;
1623 case 0xe3:
1624 if (ABCIsConst)
1625 Res = Xor(A, Nor(And(A, C), B));
1626 break;
1627 case 0xe4:
1628 if (ABCIsConst)
1629 Res = Xor(A, Nor(Xnor(A, B), C));
1630 break;
1631 case 0xe5:
1632 if (ABCIsConst)
1633 Res = Xor(A, Nor(And(A, B), C));
1634 break;
1635 case 0xe6:
1636 if (ABCIsConst)
1637 Res = Or(And(A, B), Xor(B, C));
1638 break;
1639 case 0xe7:
1640 if (ABCIsConst)
1641 Res = Or(Xnor(A, B), Xnor(A, C));
1642 break;
1643 case 0xe8:
1644 if (ABCIsConst)
1645 Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1646 break;
1647 case 0xe9:
1648 if (ABCIsConst)
1649 Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1650 break;
1651 case 0xea:
1652 if (ABCIsConst)
1653 Res = Or(And(A, B), C);
1654 break;
1655 case 0xeb:
1656 if (ABCIsConst)
1657 Res = Or(Xnor(A, B), C);
1658 break;
1659 case 0xec:
1660 if (ABCIsConst)
1661 Res = Or(And(A, C), B);
1662 break;
1663 case 0xed:
1664 if (ABCIsConst)
1665 Res = Or(Xnor(A, C), B);
1666 break;
1667 case 0xee:
1668 Res = Or(B, C);
1669 break;
1670 case 0xef:
1671 if (ABCIsConst)
1672 Res = Nand(A, Nor(B, C));
1673 break;
1674 case 0xf0:
1675 Res = A;
1676 break;
1677 case 0xf1:
1678 if (ABCIsConst)
1679 Res = Or(A, Nor(B, C));
1680 break;
1681 case 0xf2:
1682 if (ABCIsConst)
1683 Res = Or(A, Nor(B, Not(C)));
1684 break;
1685 case 0xf3:
1686 if (ABIsConst)
1687 Res = Or(A, Not(B));
1688 break;
1689 case 0xf4:
1690 if (ABCIsConst)
1691 Res = Or(A, Nor(C, Not(B)));
1692 break;
1693 case 0xf5:
1694 if (ACIsConst)
1695 Res = Or(A, Not(C));
1696 break;
1697 case 0xf6:
1698 if (ABCIsConst)
1699 Res = Or(A, Xor(B, C));
1700 break;
1701 case 0xf7:
1702 if (ABCIsConst)
1703 Res = Or(A, Nand(B, C));
1704 break;
1705 case 0xf8:
1706 if (ABCIsConst)
1707 Res = Or(A, And(B, C));
1708 break;
1709 case 0xf9:
1710 if (ABCIsConst)
1711 Res = Or(A, Xnor(B, C));
1712 break;
1713 case 0xfa:
1714 Res = Or(A, C);
1715 break;
1716 case 0xfb:
1717 if (ABCIsConst)
1718 Res = Nand(Nor(A, C), B);
1719 break;
1720 case 0xfc:
1721 Res = Or(A, B);
1722 break;
1723 case 0xfd:
1724 if (ABCIsConst)
1725 Res = Nand(Nor(A, B), C);
1726 break;
1727 case 0xfe:
1728 if (ABCIsConst)
1729 Res = Or(Or(A, B), C);
1730 break;
1731 case 0xff:
1732 Res = {Constant::getAllOnesValue(Ty), 0xff};
1733 break;
1734 }
1735
1736 assert((Res.first == nullptr || Res.second == Imm) &&
1737 "Simplification of ternary logic does not verify!");
1738 return Res.first;
1739 }
1740
simplifyX86insertps(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)1741 static Value *simplifyX86insertps(const IntrinsicInst &II,
1742 InstCombiner::BuilderTy &Builder) {
1743 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1744 if (!CInt)
1745 return nullptr;
1746
1747 auto *VecTy = cast<FixedVectorType>(II.getType());
1748 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1749
1750 // The immediate permute control byte looks like this:
1751 // [3:0] - zero mask for each 32-bit lane
1752 // [5:4] - select one 32-bit destination lane
1753 // [7:6] - select one 32-bit source lane
1754
1755 uint8_t Imm = CInt->getZExtValue();
1756 uint8_t ZMask = Imm & 0xf;
1757 uint8_t DestLane = (Imm >> 4) & 0x3;
1758 uint8_t SourceLane = (Imm >> 6) & 0x3;
1759
1760 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1761
1762 // If all zero mask bits are set, this was just a weird way to
1763 // generate a zero vector.
1764 if (ZMask == 0xf)
1765 return ZeroVector;
1766
1767 // Initialize by passing all of the first source bits through.
1768 int ShuffleMask[4] = {0, 1, 2, 3};
1769
1770 // We may replace the second operand with the zero vector.
1771 Value *V1 = II.getArgOperand(1);
1772
1773 if (ZMask) {
1774 // If the zero mask is being used with a single input or the zero mask
1775 // overrides the destination lane, this is a shuffle with the zero vector.
1776 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1777 (ZMask & (1 << DestLane))) {
1778 V1 = ZeroVector;
1779 // We may still move 32-bits of the first source vector from one lane
1780 // to another.
1781 ShuffleMask[DestLane] = SourceLane;
1782 // The zero mask may override the previous insert operation.
1783 for (unsigned i = 0; i < 4; ++i)
1784 if ((ZMask >> i) & 0x1)
1785 ShuffleMask[i] = i + 4;
1786 } else {
1787 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1788 return nullptr;
1789 }
1790 } else {
1791 // Replace the selected destination lane with the selected source lane.
1792 ShuffleMask[DestLane] = SourceLane + 4;
1793 }
1794
1795 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1796 }
1797
1798 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1799 /// or conversion to a shuffle vector.
simplifyX86extrq(IntrinsicInst & II,Value * Op0,ConstantInt * CILength,ConstantInt * CIIndex,InstCombiner::BuilderTy & Builder)1800 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1801 ConstantInt *CILength, ConstantInt *CIIndex,
1802 InstCombiner::BuilderTy &Builder) {
1803 auto LowConstantHighUndef = [&](uint64_t Val) {
1804 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1805 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1806 UndefValue::get(IntTy64)};
1807 return ConstantVector::get(Args);
1808 };
1809
1810 // See if we're dealing with constant values.
1811 auto *C0 = dyn_cast<Constant>(Op0);
1812 auto *CI0 =
1813 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1814 : nullptr;
1815
1816 // Attempt to constant fold.
1817 if (CILength && CIIndex) {
1818 // From AMD documentation: "The bit index and field length are each six
1819 // bits in length other bits of the field are ignored."
1820 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1821 APInt APLength = CILength->getValue().zextOrTrunc(6);
1822
1823 unsigned Index = APIndex.getZExtValue();
1824
1825 // From AMD documentation: "a value of zero in the field length is
1826 // defined as length of 64".
1827 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1828
1829 // From AMD documentation: "If the sum of the bit index + length field
1830 // is greater than 64, the results are undefined".
1831 unsigned End = Index + Length;
1832
1833 // Note that both field index and field length are 8-bit quantities.
1834 // Since variables 'Index' and 'Length' are unsigned values
1835 // obtained from zero-extending field index and field length
1836 // respectively, their sum should never wrap around.
1837 if (End > 64)
1838 return UndefValue::get(II.getType());
1839
1840 // If we are inserting whole bytes, we can convert this to a shuffle.
1841 // Lowering can recognize EXTRQI shuffle masks.
1842 if ((Length % 8) == 0 && (Index % 8) == 0) {
1843 // Convert bit indices to byte indices.
1844 Length /= 8;
1845 Index /= 8;
1846
1847 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1848 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1849
1850 SmallVector<int, 16> ShuffleMask;
1851 for (int i = 0; i != (int)Length; ++i)
1852 ShuffleMask.push_back(i + Index);
1853 for (int i = Length; i != 8; ++i)
1854 ShuffleMask.push_back(i + 16);
1855 for (int i = 8; i != 16; ++i)
1856 ShuffleMask.push_back(-1);
1857
1858 Value *SV = Builder.CreateShuffleVector(
1859 Builder.CreateBitCast(Op0, ShufTy),
1860 ConstantAggregateZero::get(ShufTy), ShuffleMask);
1861 return Builder.CreateBitCast(SV, II.getType());
1862 }
1863
1864 // Constant Fold - shift Index'th bit to lowest position and mask off
1865 // Length bits.
1866 if (CI0) {
1867 APInt Elt = CI0->getValue();
1868 Elt.lshrInPlace(Index);
1869 Elt = Elt.zextOrTrunc(Length);
1870 return LowConstantHighUndef(Elt.getZExtValue());
1871 }
1872
1873 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1874 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1875 Value *Args[] = {Op0, CILength, CIIndex};
1876 Module *M = II.getModule();
1877 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1878 return Builder.CreateCall(F, Args);
1879 }
1880 }
1881
1882 // Constant Fold - extraction from zero is always {zero, undef}.
1883 if (CI0 && CI0->isZero())
1884 return LowConstantHighUndef(0);
1885
1886 return nullptr;
1887 }
1888
1889 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1890 /// folding or conversion to a shuffle vector.
simplifyX86insertq(IntrinsicInst & II,Value * Op0,Value * Op1,APInt APLength,APInt APIndex,InstCombiner::BuilderTy & Builder)1891 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1892 APInt APLength, APInt APIndex,
1893 InstCombiner::BuilderTy &Builder) {
1894 // From AMD documentation: "The bit index and field length are each six bits
1895 // in length other bits of the field are ignored."
1896 APIndex = APIndex.zextOrTrunc(6);
1897 APLength = APLength.zextOrTrunc(6);
1898
1899 // Attempt to constant fold.
1900 unsigned Index = APIndex.getZExtValue();
1901
1902 // From AMD documentation: "a value of zero in the field length is
1903 // defined as length of 64".
1904 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1905
1906 // From AMD documentation: "If the sum of the bit index + length field
1907 // is greater than 64, the results are undefined".
1908 unsigned End = Index + Length;
1909
1910 // Note that both field index and field length are 8-bit quantities.
1911 // Since variables 'Index' and 'Length' are unsigned values
1912 // obtained from zero-extending field index and field length
1913 // respectively, their sum should never wrap around.
1914 if (End > 64)
1915 return UndefValue::get(II.getType());
1916
1917 // If we are inserting whole bytes, we can convert this to a shuffle.
1918 // Lowering can recognize INSERTQI shuffle masks.
1919 if ((Length % 8) == 0 && (Index % 8) == 0) {
1920 // Convert bit indices to byte indices.
1921 Length /= 8;
1922 Index /= 8;
1923
1924 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1925 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1926
1927 SmallVector<int, 16> ShuffleMask;
1928 for (int i = 0; i != (int)Index; ++i)
1929 ShuffleMask.push_back(i);
1930 for (int i = 0; i != (int)Length; ++i)
1931 ShuffleMask.push_back(i + 16);
1932 for (int i = Index + Length; i != 8; ++i)
1933 ShuffleMask.push_back(i);
1934 for (int i = 8; i != 16; ++i)
1935 ShuffleMask.push_back(-1);
1936
1937 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1938 Builder.CreateBitCast(Op1, ShufTy),
1939 ShuffleMask);
1940 return Builder.CreateBitCast(SV, II.getType());
1941 }
1942
1943 // See if we're dealing with constant values.
1944 auto *C0 = dyn_cast<Constant>(Op0);
1945 auto *C1 = dyn_cast<Constant>(Op1);
1946 auto *CI00 =
1947 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1948 : nullptr;
1949 auto *CI10 =
1950 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1951 : nullptr;
1952
1953 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1954 if (CI00 && CI10) {
1955 APInt V00 = CI00->getValue();
1956 APInt V10 = CI10->getValue();
1957 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1958 V00 = V00 & ~Mask;
1959 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1960 APInt Val = V00 | V10;
1961 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1962 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1963 UndefValue::get(IntTy64)};
1964 return ConstantVector::get(Args);
1965 }
1966
1967 // If we were an INSERTQ call, we'll save demanded elements if we convert to
1968 // INSERTQI.
1969 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1970 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1971 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1972 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1973
1974 Value *Args[] = {Op0, Op1, CILength, CIIndex};
1975 Module *M = II.getModule();
1976 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1977 return Builder.CreateCall(F, Args);
1978 }
1979
1980 return nullptr;
1981 }
1982
1983 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
simplifyX86pshufb(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)1984 static Value *simplifyX86pshufb(const IntrinsicInst &II,
1985 InstCombiner::BuilderTy &Builder) {
1986 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1987 if (!V)
1988 return nullptr;
1989
1990 auto *VecTy = cast<FixedVectorType>(II.getType());
1991 unsigned NumElts = VecTy->getNumElements();
1992 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1993 "Unexpected number of elements in shuffle mask!");
1994
1995 // Construct a shuffle mask from constant integers or UNDEFs.
1996 int Indexes[64];
1997
1998 // Each byte in the shuffle control mask forms an index to permute the
1999 // corresponding byte in the destination operand.
2000 for (unsigned I = 0; I < NumElts; ++I) {
2001 Constant *COp = V->getAggregateElement(I);
2002 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2003 return nullptr;
2004
2005 if (isa<UndefValue>(COp)) {
2006 Indexes[I] = -1;
2007 continue;
2008 }
2009
2010 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
2011
2012 // If the most significant bit (bit[7]) of each byte of the shuffle
2013 // control mask is set, then zero is written in the result byte.
2014 // The zero vector is in the right-hand side of the resulting
2015 // shufflevector.
2016
2017 // The value of each index for the high 128-bit lane is the least
2018 // significant 4 bits of the respective shuffle control byte.
2019 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
2020 Indexes[I] = Index;
2021 }
2022
2023 auto V1 = II.getArgOperand(0);
2024 auto V2 = Constant::getNullValue(VecTy);
2025 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2026 }
2027
2028 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
simplifyX86vpermilvar(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)2029 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
2030 InstCombiner::BuilderTy &Builder) {
2031 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2032 if (!V)
2033 return nullptr;
2034
2035 auto *VecTy = cast<FixedVectorType>(II.getType());
2036 unsigned NumElts = VecTy->getNumElements();
2037 bool IsPD = VecTy->getScalarType()->isDoubleTy();
2038 unsigned NumLaneElts = IsPD ? 2 : 4;
2039 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2040
2041 // Construct a shuffle mask from constant integers or UNDEFs.
2042 int Indexes[16];
2043
2044 // The intrinsics only read one or two bits, clear the rest.
2045 for (unsigned I = 0; I < NumElts; ++I) {
2046 Constant *COp = V->getAggregateElement(I);
2047 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2048 return nullptr;
2049
2050 if (isa<UndefValue>(COp)) {
2051 Indexes[I] = -1;
2052 continue;
2053 }
2054
2055 APInt Index = cast<ConstantInt>(COp)->getValue();
2056 Index = Index.zextOrTrunc(32).getLoBits(2);
2057
2058 // The PD variants uses bit 1 to select per-lane element index, so
2059 // shift down to convert to generic shuffle mask index.
2060 if (IsPD)
2061 Index.lshrInPlace(1);
2062
2063 // The _256 variants are a bit trickier since the mask bits always index
2064 // into the corresponding 128 half. In order to convert to a generic
2065 // shuffle, we have to make that explicit.
2066 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2067
2068 Indexes[I] = Index.getZExtValue();
2069 }
2070
2071 auto V1 = II.getArgOperand(0);
2072 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2073 }
2074
2075 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
simplifyX86vpermv(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)2076 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2077 InstCombiner::BuilderTy &Builder) {
2078 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2079 if (!V)
2080 return nullptr;
2081
2082 auto *VecTy = cast<FixedVectorType>(II.getType());
2083 unsigned Size = VecTy->getNumElements();
2084 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2085 "Unexpected shuffle mask size");
2086
2087 // Construct a shuffle mask from constant integers or UNDEFs.
2088 int Indexes[64];
2089
2090 for (unsigned I = 0; I < Size; ++I) {
2091 Constant *COp = V->getAggregateElement(I);
2092 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2093 return nullptr;
2094
2095 if (isa<UndefValue>(COp)) {
2096 Indexes[I] = -1;
2097 continue;
2098 }
2099
2100 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2101 Index &= Size - 1;
2102 Indexes[I] = Index;
2103 }
2104
2105 auto V1 = II.getArgOperand(0);
2106 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2107 }
2108
2109 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
simplifyX86vpermv3(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)2110 static Value *simplifyX86vpermv3(const IntrinsicInst &II,
2111 InstCombiner::BuilderTy &Builder) {
2112 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2113 if (!V)
2114 return nullptr;
2115
2116 auto *VecTy = cast<FixedVectorType>(II.getType());
2117 unsigned Size = VecTy->getNumElements();
2118 assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2119 Size == 64) &&
2120 "Unexpected shuffle mask size");
2121
2122 // Construct a shuffle mask from constant integers or UNDEFs.
2123 int Indexes[64];
2124
2125 for (unsigned I = 0; I < Size; ++I) {
2126 Constant *COp = V->getAggregateElement(I);
2127 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2128 return nullptr;
2129
2130 if (isa<UndefValue>(COp)) {
2131 Indexes[I] = -1;
2132 continue;
2133 }
2134
2135 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2136 Index &= (2 * Size) - 1;
2137 Indexes[I] = Index;
2138 }
2139
2140 auto V1 = II.getArgOperand(0);
2141 auto V2 = II.getArgOperand(2);
2142 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2143 }
2144
2145 std::optional<Instruction *>
instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II) const2146 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2147 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2148 unsigned DemandedWidth) {
2149 APInt UndefElts(Width, 0);
2150 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2151 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2152 };
2153
2154 Intrinsic::ID IID = II.getIntrinsicID();
2155 switch (IID) {
2156 case Intrinsic::x86_bmi_bextr_32:
2157 case Intrinsic::x86_bmi_bextr_64:
2158 case Intrinsic::x86_tbm_bextri_u32:
2159 case Intrinsic::x86_tbm_bextri_u64:
2160 // If the RHS is a constant we can try some simplifications.
2161 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2162 uint64_t Shift = C->getZExtValue();
2163 uint64_t Length = (Shift >> 8) & 0xff;
2164 Shift &= 0xff;
2165 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2166 // If the length is 0 or the shift is out of range, replace with zero.
2167 if (Length == 0 || Shift >= BitWidth) {
2168 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2169 }
2170 // If the LHS is also a constant, we can completely constant fold this.
2171 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2172 uint64_t Result = InC->getZExtValue() >> Shift;
2173 if (Length > BitWidth)
2174 Length = BitWidth;
2175 Result &= maskTrailingOnes<uint64_t>(Length);
2176 return IC.replaceInstUsesWith(II,
2177 ConstantInt::get(II.getType(), Result));
2178 }
2179 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2180 // are only masking bits that a shift already cleared?
2181 }
2182 break;
2183
2184 case Intrinsic::x86_bmi_bzhi_32:
2185 case Intrinsic::x86_bmi_bzhi_64:
2186 // If the RHS is a constant we can try some simplifications.
2187 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2188 uint64_t Index = C->getZExtValue() & 0xff;
2189 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2190 if (Index >= BitWidth) {
2191 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2192 }
2193 if (Index == 0) {
2194 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2195 }
2196 // If the LHS is also a constant, we can completely constant fold this.
2197 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2198 uint64_t Result = InC->getZExtValue();
2199 Result &= maskTrailingOnes<uint64_t>(Index);
2200 return IC.replaceInstUsesWith(II,
2201 ConstantInt::get(II.getType(), Result));
2202 }
2203 // TODO should we convert this to an AND if the RHS is constant?
2204 }
2205 break;
2206 case Intrinsic::x86_bmi_pext_32:
2207 case Intrinsic::x86_bmi_pext_64:
2208 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2209 if (MaskC->isNullValue()) {
2210 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2211 }
2212 if (MaskC->isAllOnesValue()) {
2213 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2214 }
2215
2216 unsigned MaskIdx, MaskLen;
2217 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2218 // any single contingous sequence of 1s anywhere in the mask simply
2219 // describes a subset of the input bits shifted to the appropriate
2220 // position. Replace with the straight forward IR.
2221 Value *Input = II.getArgOperand(0);
2222 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2223 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2224 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2225 return IC.replaceInstUsesWith(II, Shifted);
2226 }
2227
2228 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2229 uint64_t Src = SrcC->getZExtValue();
2230 uint64_t Mask = MaskC->getZExtValue();
2231 uint64_t Result = 0;
2232 uint64_t BitToSet = 1;
2233
2234 while (Mask) {
2235 // Isolate lowest set bit.
2236 uint64_t BitToTest = Mask & -Mask;
2237 if (BitToTest & Src)
2238 Result |= BitToSet;
2239
2240 BitToSet <<= 1;
2241 // Clear lowest set bit.
2242 Mask &= Mask - 1;
2243 }
2244
2245 return IC.replaceInstUsesWith(II,
2246 ConstantInt::get(II.getType(), Result));
2247 }
2248 }
2249 break;
2250 case Intrinsic::x86_bmi_pdep_32:
2251 case Intrinsic::x86_bmi_pdep_64:
2252 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2253 if (MaskC->isNullValue()) {
2254 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2255 }
2256 if (MaskC->isAllOnesValue()) {
2257 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2258 }
2259
2260 unsigned MaskIdx, MaskLen;
2261 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2262 // any single contingous sequence of 1s anywhere in the mask simply
2263 // describes a subset of the input bits shifted to the appropriate
2264 // position. Replace with the straight forward IR.
2265 Value *Input = II.getArgOperand(0);
2266 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2267 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2268 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2269 return IC.replaceInstUsesWith(II, Masked);
2270 }
2271
2272 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2273 uint64_t Src = SrcC->getZExtValue();
2274 uint64_t Mask = MaskC->getZExtValue();
2275 uint64_t Result = 0;
2276 uint64_t BitToTest = 1;
2277
2278 while (Mask) {
2279 // Isolate lowest set bit.
2280 uint64_t BitToSet = Mask & -Mask;
2281 if (BitToTest & Src)
2282 Result |= BitToSet;
2283
2284 BitToTest <<= 1;
2285 // Clear lowest set bit;
2286 Mask &= Mask - 1;
2287 }
2288
2289 return IC.replaceInstUsesWith(II,
2290 ConstantInt::get(II.getType(), Result));
2291 }
2292 }
2293 break;
2294
2295 case Intrinsic::x86_sse_cvtss2si:
2296 case Intrinsic::x86_sse_cvtss2si64:
2297 case Intrinsic::x86_sse_cvttss2si:
2298 case Intrinsic::x86_sse_cvttss2si64:
2299 case Intrinsic::x86_sse2_cvtsd2si:
2300 case Intrinsic::x86_sse2_cvtsd2si64:
2301 case Intrinsic::x86_sse2_cvttsd2si:
2302 case Intrinsic::x86_sse2_cvttsd2si64:
2303 case Intrinsic::x86_avx512_vcvtss2si32:
2304 case Intrinsic::x86_avx512_vcvtss2si64:
2305 case Intrinsic::x86_avx512_vcvtss2usi32:
2306 case Intrinsic::x86_avx512_vcvtss2usi64:
2307 case Intrinsic::x86_avx512_vcvtsd2si32:
2308 case Intrinsic::x86_avx512_vcvtsd2si64:
2309 case Intrinsic::x86_avx512_vcvtsd2usi32:
2310 case Intrinsic::x86_avx512_vcvtsd2usi64:
2311 case Intrinsic::x86_avx512_cvttss2si:
2312 case Intrinsic::x86_avx512_cvttss2si64:
2313 case Intrinsic::x86_avx512_cvttss2usi:
2314 case Intrinsic::x86_avx512_cvttss2usi64:
2315 case Intrinsic::x86_avx512_cvttsd2si:
2316 case Intrinsic::x86_avx512_cvttsd2si64:
2317 case Intrinsic::x86_avx512_cvttsd2usi:
2318 case Intrinsic::x86_avx512_cvttsd2usi64: {
2319 // These intrinsics only demand the 0th element of their input vectors. If
2320 // we can simplify the input based on that, do so now.
2321 Value *Arg = II.getArgOperand(0);
2322 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2323 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2324 return IC.replaceOperand(II, 0, V);
2325 }
2326 break;
2327 }
2328
2329 case Intrinsic::x86_mmx_pmovmskb:
2330 case Intrinsic::x86_sse_movmsk_ps:
2331 case Intrinsic::x86_sse2_movmsk_pd:
2332 case Intrinsic::x86_sse2_pmovmskb_128:
2333 case Intrinsic::x86_avx_movmsk_pd_256:
2334 case Intrinsic::x86_avx_movmsk_ps_256:
2335 case Intrinsic::x86_avx2_pmovmskb:
2336 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2337 return IC.replaceInstUsesWith(II, V);
2338 }
2339 break;
2340
2341 case Intrinsic::x86_sse_comieq_ss:
2342 case Intrinsic::x86_sse_comige_ss:
2343 case Intrinsic::x86_sse_comigt_ss:
2344 case Intrinsic::x86_sse_comile_ss:
2345 case Intrinsic::x86_sse_comilt_ss:
2346 case Intrinsic::x86_sse_comineq_ss:
2347 case Intrinsic::x86_sse_ucomieq_ss:
2348 case Intrinsic::x86_sse_ucomige_ss:
2349 case Intrinsic::x86_sse_ucomigt_ss:
2350 case Intrinsic::x86_sse_ucomile_ss:
2351 case Intrinsic::x86_sse_ucomilt_ss:
2352 case Intrinsic::x86_sse_ucomineq_ss:
2353 case Intrinsic::x86_sse2_comieq_sd:
2354 case Intrinsic::x86_sse2_comige_sd:
2355 case Intrinsic::x86_sse2_comigt_sd:
2356 case Intrinsic::x86_sse2_comile_sd:
2357 case Intrinsic::x86_sse2_comilt_sd:
2358 case Intrinsic::x86_sse2_comineq_sd:
2359 case Intrinsic::x86_sse2_ucomieq_sd:
2360 case Intrinsic::x86_sse2_ucomige_sd:
2361 case Intrinsic::x86_sse2_ucomigt_sd:
2362 case Intrinsic::x86_sse2_ucomile_sd:
2363 case Intrinsic::x86_sse2_ucomilt_sd:
2364 case Intrinsic::x86_sse2_ucomineq_sd:
2365 case Intrinsic::x86_avx512_vcomi_ss:
2366 case Intrinsic::x86_avx512_vcomi_sd:
2367 case Intrinsic::x86_avx512_mask_cmp_ss:
2368 case Intrinsic::x86_avx512_mask_cmp_sd: {
2369 // These intrinsics only demand the 0th element of their input vectors. If
2370 // we can simplify the input based on that, do so now.
2371 bool MadeChange = false;
2372 Value *Arg0 = II.getArgOperand(0);
2373 Value *Arg1 = II.getArgOperand(1);
2374 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2375 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2376 IC.replaceOperand(II, 0, V);
2377 MadeChange = true;
2378 }
2379 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2380 IC.replaceOperand(II, 1, V);
2381 MadeChange = true;
2382 }
2383 if (MadeChange) {
2384 return &II;
2385 }
2386 break;
2387 }
2388
2389 case Intrinsic::x86_avx512_add_ps_512:
2390 case Intrinsic::x86_avx512_div_ps_512:
2391 case Intrinsic::x86_avx512_mul_ps_512:
2392 case Intrinsic::x86_avx512_sub_ps_512:
2393 case Intrinsic::x86_avx512_add_pd_512:
2394 case Intrinsic::x86_avx512_div_pd_512:
2395 case Intrinsic::x86_avx512_mul_pd_512:
2396 case Intrinsic::x86_avx512_sub_pd_512:
2397 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2398 // IR operations.
2399 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2400 if (R->getValue() == 4) {
2401 Value *Arg0 = II.getArgOperand(0);
2402 Value *Arg1 = II.getArgOperand(1);
2403
2404 Value *V;
2405 switch (IID) {
2406 default:
2407 llvm_unreachable("Case stmts out of sync!");
2408 case Intrinsic::x86_avx512_add_ps_512:
2409 case Intrinsic::x86_avx512_add_pd_512:
2410 V = IC.Builder.CreateFAdd(Arg0, Arg1);
2411 break;
2412 case Intrinsic::x86_avx512_sub_ps_512:
2413 case Intrinsic::x86_avx512_sub_pd_512:
2414 V = IC.Builder.CreateFSub(Arg0, Arg1);
2415 break;
2416 case Intrinsic::x86_avx512_mul_ps_512:
2417 case Intrinsic::x86_avx512_mul_pd_512:
2418 V = IC.Builder.CreateFMul(Arg0, Arg1);
2419 break;
2420 case Intrinsic::x86_avx512_div_ps_512:
2421 case Intrinsic::x86_avx512_div_pd_512:
2422 V = IC.Builder.CreateFDiv(Arg0, Arg1);
2423 break;
2424 }
2425
2426 return IC.replaceInstUsesWith(II, V);
2427 }
2428 }
2429 break;
2430
2431 case Intrinsic::x86_avx512_mask_add_ss_round:
2432 case Intrinsic::x86_avx512_mask_div_ss_round:
2433 case Intrinsic::x86_avx512_mask_mul_ss_round:
2434 case Intrinsic::x86_avx512_mask_sub_ss_round:
2435 case Intrinsic::x86_avx512_mask_add_sd_round:
2436 case Intrinsic::x86_avx512_mask_div_sd_round:
2437 case Intrinsic::x86_avx512_mask_mul_sd_round:
2438 case Intrinsic::x86_avx512_mask_sub_sd_round:
2439 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2440 // IR operations.
2441 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2442 if (R->getValue() == 4) {
2443 // Extract the element as scalars.
2444 Value *Arg0 = II.getArgOperand(0);
2445 Value *Arg1 = II.getArgOperand(1);
2446 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2447 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2448
2449 Value *V;
2450 switch (IID) {
2451 default:
2452 llvm_unreachable("Case stmts out of sync!");
2453 case Intrinsic::x86_avx512_mask_add_ss_round:
2454 case Intrinsic::x86_avx512_mask_add_sd_round:
2455 V = IC.Builder.CreateFAdd(LHS, RHS);
2456 break;
2457 case Intrinsic::x86_avx512_mask_sub_ss_round:
2458 case Intrinsic::x86_avx512_mask_sub_sd_round:
2459 V = IC.Builder.CreateFSub(LHS, RHS);
2460 break;
2461 case Intrinsic::x86_avx512_mask_mul_ss_round:
2462 case Intrinsic::x86_avx512_mask_mul_sd_round:
2463 V = IC.Builder.CreateFMul(LHS, RHS);
2464 break;
2465 case Intrinsic::x86_avx512_mask_div_ss_round:
2466 case Intrinsic::x86_avx512_mask_div_sd_round:
2467 V = IC.Builder.CreateFDiv(LHS, RHS);
2468 break;
2469 }
2470
2471 // Handle the masking aspect of the intrinsic.
2472 Value *Mask = II.getArgOperand(3);
2473 auto *C = dyn_cast<ConstantInt>(Mask);
2474 // We don't need a select if we know the mask bit is a 1.
2475 if (!C || !C->getValue()[0]) {
2476 // Cast the mask to an i1 vector and then extract the lowest element.
2477 auto *MaskTy = FixedVectorType::get(
2478 IC.Builder.getInt1Ty(),
2479 cast<IntegerType>(Mask->getType())->getBitWidth());
2480 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2481 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2482 // Extract the lowest element from the passthru operand.
2483 Value *Passthru =
2484 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2485 V = IC.Builder.CreateSelect(Mask, V, Passthru);
2486 }
2487
2488 // Insert the result back into the original argument 0.
2489 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2490
2491 return IC.replaceInstUsesWith(II, V);
2492 }
2493 }
2494 break;
2495
2496 // Constant fold ashr( <A x Bi>, Ci ).
2497 // Constant fold lshr( <A x Bi>, Ci ).
2498 // Constant fold shl( <A x Bi>, Ci ).
2499 case Intrinsic::x86_sse2_psrai_d:
2500 case Intrinsic::x86_sse2_psrai_w:
2501 case Intrinsic::x86_avx2_psrai_d:
2502 case Intrinsic::x86_avx2_psrai_w:
2503 case Intrinsic::x86_avx512_psrai_q_128:
2504 case Intrinsic::x86_avx512_psrai_q_256:
2505 case Intrinsic::x86_avx512_psrai_d_512:
2506 case Intrinsic::x86_avx512_psrai_q_512:
2507 case Intrinsic::x86_avx512_psrai_w_512:
2508 case Intrinsic::x86_sse2_psrli_d:
2509 case Intrinsic::x86_sse2_psrli_q:
2510 case Intrinsic::x86_sse2_psrli_w:
2511 case Intrinsic::x86_avx2_psrli_d:
2512 case Intrinsic::x86_avx2_psrli_q:
2513 case Intrinsic::x86_avx2_psrli_w:
2514 case Intrinsic::x86_avx512_psrli_d_512:
2515 case Intrinsic::x86_avx512_psrli_q_512:
2516 case Intrinsic::x86_avx512_psrli_w_512:
2517 case Intrinsic::x86_sse2_pslli_d:
2518 case Intrinsic::x86_sse2_pslli_q:
2519 case Intrinsic::x86_sse2_pslli_w:
2520 case Intrinsic::x86_avx2_pslli_d:
2521 case Intrinsic::x86_avx2_pslli_q:
2522 case Intrinsic::x86_avx2_pslli_w:
2523 case Intrinsic::x86_avx512_pslli_d_512:
2524 case Intrinsic::x86_avx512_pslli_q_512:
2525 case Intrinsic::x86_avx512_pslli_w_512:
2526 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2527 return IC.replaceInstUsesWith(II, V);
2528 }
2529 break;
2530
2531 case Intrinsic::x86_sse2_psra_d:
2532 case Intrinsic::x86_sse2_psra_w:
2533 case Intrinsic::x86_avx2_psra_d:
2534 case Intrinsic::x86_avx2_psra_w:
2535 case Intrinsic::x86_avx512_psra_q_128:
2536 case Intrinsic::x86_avx512_psra_q_256:
2537 case Intrinsic::x86_avx512_psra_d_512:
2538 case Intrinsic::x86_avx512_psra_q_512:
2539 case Intrinsic::x86_avx512_psra_w_512:
2540 case Intrinsic::x86_sse2_psrl_d:
2541 case Intrinsic::x86_sse2_psrl_q:
2542 case Intrinsic::x86_sse2_psrl_w:
2543 case Intrinsic::x86_avx2_psrl_d:
2544 case Intrinsic::x86_avx2_psrl_q:
2545 case Intrinsic::x86_avx2_psrl_w:
2546 case Intrinsic::x86_avx512_psrl_d_512:
2547 case Intrinsic::x86_avx512_psrl_q_512:
2548 case Intrinsic::x86_avx512_psrl_w_512:
2549 case Intrinsic::x86_sse2_psll_d:
2550 case Intrinsic::x86_sse2_psll_q:
2551 case Intrinsic::x86_sse2_psll_w:
2552 case Intrinsic::x86_avx2_psll_d:
2553 case Intrinsic::x86_avx2_psll_q:
2554 case Intrinsic::x86_avx2_psll_w:
2555 case Intrinsic::x86_avx512_psll_d_512:
2556 case Intrinsic::x86_avx512_psll_q_512:
2557 case Intrinsic::x86_avx512_psll_w_512: {
2558 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2559 return IC.replaceInstUsesWith(II, V);
2560 }
2561
2562 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2563 // operand to compute the shift amount.
2564 Value *Arg1 = II.getArgOperand(1);
2565 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2566 "Unexpected packed shift size");
2567 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2568
2569 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2570 return IC.replaceOperand(II, 1, V);
2571 }
2572 break;
2573 }
2574
2575 case Intrinsic::x86_avx2_psllv_d:
2576 case Intrinsic::x86_avx2_psllv_d_256:
2577 case Intrinsic::x86_avx2_psllv_q:
2578 case Intrinsic::x86_avx2_psllv_q_256:
2579 case Intrinsic::x86_avx512_psllv_d_512:
2580 case Intrinsic::x86_avx512_psllv_q_512:
2581 case Intrinsic::x86_avx512_psllv_w_128:
2582 case Intrinsic::x86_avx512_psllv_w_256:
2583 case Intrinsic::x86_avx512_psllv_w_512:
2584 case Intrinsic::x86_avx2_psrav_d:
2585 case Intrinsic::x86_avx2_psrav_d_256:
2586 case Intrinsic::x86_avx512_psrav_q_128:
2587 case Intrinsic::x86_avx512_psrav_q_256:
2588 case Intrinsic::x86_avx512_psrav_d_512:
2589 case Intrinsic::x86_avx512_psrav_q_512:
2590 case Intrinsic::x86_avx512_psrav_w_128:
2591 case Intrinsic::x86_avx512_psrav_w_256:
2592 case Intrinsic::x86_avx512_psrav_w_512:
2593 case Intrinsic::x86_avx2_psrlv_d:
2594 case Intrinsic::x86_avx2_psrlv_d_256:
2595 case Intrinsic::x86_avx2_psrlv_q:
2596 case Intrinsic::x86_avx2_psrlv_q_256:
2597 case Intrinsic::x86_avx512_psrlv_d_512:
2598 case Intrinsic::x86_avx512_psrlv_q_512:
2599 case Intrinsic::x86_avx512_psrlv_w_128:
2600 case Intrinsic::x86_avx512_psrlv_w_256:
2601 case Intrinsic::x86_avx512_psrlv_w_512:
2602 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2603 return IC.replaceInstUsesWith(II, V);
2604 }
2605 break;
2606
2607 case Intrinsic::x86_sse2_packssdw_128:
2608 case Intrinsic::x86_sse2_packsswb_128:
2609 case Intrinsic::x86_avx2_packssdw:
2610 case Intrinsic::x86_avx2_packsswb:
2611 case Intrinsic::x86_avx512_packssdw_512:
2612 case Intrinsic::x86_avx512_packsswb_512:
2613 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2614 return IC.replaceInstUsesWith(II, V);
2615 }
2616 break;
2617
2618 case Intrinsic::x86_sse2_packuswb_128:
2619 case Intrinsic::x86_sse41_packusdw:
2620 case Intrinsic::x86_avx2_packusdw:
2621 case Intrinsic::x86_avx2_packuswb:
2622 case Intrinsic::x86_avx512_packusdw_512:
2623 case Intrinsic::x86_avx512_packuswb_512:
2624 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2625 return IC.replaceInstUsesWith(II, V);
2626 }
2627 break;
2628
2629 case Intrinsic::x86_sse2_pmulh_w:
2630 case Intrinsic::x86_avx2_pmulh_w:
2631 case Intrinsic::x86_avx512_pmulh_w_512:
2632 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
2633 return IC.replaceInstUsesWith(II, V);
2634 }
2635 break;
2636
2637 case Intrinsic::x86_sse2_pmulhu_w:
2638 case Intrinsic::x86_avx2_pmulhu_w:
2639 case Intrinsic::x86_avx512_pmulhu_w_512:
2640 if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
2641 return IC.replaceInstUsesWith(II, V);
2642 }
2643 break;
2644
2645 case Intrinsic::x86_ssse3_pmul_hr_sw_128:
2646 case Intrinsic::x86_avx2_pmul_hr_sw:
2647 case Intrinsic::x86_avx512_pmul_hr_sw_512:
2648 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
2649 return IC.replaceInstUsesWith(II, V);
2650 }
2651 break;
2652
2653 case Intrinsic::x86_sse2_pmadd_wd:
2654 case Intrinsic::x86_avx2_pmadd_wd:
2655 case Intrinsic::x86_avx512_pmaddw_d_512:
2656 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2657 return IC.replaceInstUsesWith(II, V);
2658 }
2659 break;
2660
2661 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2662 case Intrinsic::x86_avx2_pmadd_ub_sw:
2663 case Intrinsic::x86_avx512_pmaddubs_w_512:
2664 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2665 return IC.replaceInstUsesWith(II, V);
2666 }
2667 break;
2668
2669 case Intrinsic::x86_pclmulqdq:
2670 case Intrinsic::x86_pclmulqdq_256:
2671 case Intrinsic::x86_pclmulqdq_512: {
2672 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2673 unsigned Imm = C->getZExtValue();
2674
2675 bool MadeChange = false;
2676 Value *Arg0 = II.getArgOperand(0);
2677 Value *Arg1 = II.getArgOperand(1);
2678 unsigned VWidth =
2679 cast<FixedVectorType>(Arg0->getType())->getNumElements();
2680
2681 APInt UndefElts1(VWidth, 0);
2682 APInt DemandedElts1 =
2683 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2684 if (Value *V =
2685 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2686 IC.replaceOperand(II, 0, V);
2687 MadeChange = true;
2688 }
2689
2690 APInt UndefElts2(VWidth, 0);
2691 APInt DemandedElts2 =
2692 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2693 if (Value *V =
2694 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2695 IC.replaceOperand(II, 1, V);
2696 MadeChange = true;
2697 }
2698
2699 // If either input elements are undef, the result is zero.
2700 if (DemandedElts1.isSubsetOf(UndefElts1) ||
2701 DemandedElts2.isSubsetOf(UndefElts2)) {
2702 return IC.replaceInstUsesWith(II,
2703 ConstantAggregateZero::get(II.getType()));
2704 }
2705
2706 if (MadeChange) {
2707 return &II;
2708 }
2709 }
2710 break;
2711 }
2712
2713 case Intrinsic::x86_sse41_insertps:
2714 if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2715 return IC.replaceInstUsesWith(II, V);
2716 }
2717 break;
2718
2719 case Intrinsic::x86_sse4a_extrq: {
2720 Value *Op0 = II.getArgOperand(0);
2721 Value *Op1 = II.getArgOperand(1);
2722 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2723 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2724 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2725 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2726 VWidth1 == 16 && "Unexpected operand sizes");
2727
2728 // See if we're dealing with constant values.
2729 auto *C1 = dyn_cast<Constant>(Op1);
2730 auto *CILength =
2731 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2732 : nullptr;
2733 auto *CIIndex =
2734 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2735 : nullptr;
2736
2737 // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2738 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2739 return IC.replaceInstUsesWith(II, V);
2740 }
2741
2742 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2743 // operands and the lowest 16-bits of the second.
2744 bool MadeChange = false;
2745 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2746 IC.replaceOperand(II, 0, V);
2747 MadeChange = true;
2748 }
2749 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2750 IC.replaceOperand(II, 1, V);
2751 MadeChange = true;
2752 }
2753 if (MadeChange) {
2754 return &II;
2755 }
2756 break;
2757 }
2758
2759 case Intrinsic::x86_sse4a_extrqi: {
2760 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2761 // bits of the lower 64-bits. The upper 64-bits are undefined.
2762 Value *Op0 = II.getArgOperand(0);
2763 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2764 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2765 "Unexpected operand size");
2766
2767 // See if we're dealing with constant values.
2768 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2769 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2770
2771 // Attempt to simplify to a constant or shuffle vector.
2772 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2773 return IC.replaceInstUsesWith(II, V);
2774 }
2775
2776 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2777 // operand.
2778 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2779 return IC.replaceOperand(II, 0, V);
2780 }
2781 break;
2782 }
2783
2784 case Intrinsic::x86_sse4a_insertq: {
2785 Value *Op0 = II.getArgOperand(0);
2786 Value *Op1 = II.getArgOperand(1);
2787 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2788 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2789 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2790 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2791 "Unexpected operand size");
2792
2793 // See if we're dealing with constant values.
2794 auto *C1 = dyn_cast<Constant>(Op1);
2795 auto *CI11 =
2796 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2797 : nullptr;
2798
2799 // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2800 if (CI11) {
2801 const APInt &V11 = CI11->getValue();
2802 APInt Len = V11.zextOrTrunc(6);
2803 APInt Idx = V11.lshr(8).zextOrTrunc(6);
2804 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2805 return IC.replaceInstUsesWith(II, V);
2806 }
2807 }
2808
2809 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2810 // operand.
2811 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2812 return IC.replaceOperand(II, 0, V);
2813 }
2814 break;
2815 }
2816
2817 case Intrinsic::x86_sse4a_insertqi: {
2818 // INSERTQI: Extract lowest Length bits from lower half of second source and
2819 // insert over first source starting at Index bit. The upper 64-bits are
2820 // undefined.
2821 Value *Op0 = II.getArgOperand(0);
2822 Value *Op1 = II.getArgOperand(1);
2823 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2824 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2825 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2826 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2827 VWidth1 == 2 && "Unexpected operand sizes");
2828
2829 // See if we're dealing with constant values.
2830 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2831 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2832
2833 // Attempt to simplify to a constant or shuffle vector.
2834 if (CILength && CIIndex) {
2835 APInt Len = CILength->getValue().zextOrTrunc(6);
2836 APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2837 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2838 return IC.replaceInstUsesWith(II, V);
2839 }
2840 }
2841
2842 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2843 // operands.
2844 bool MadeChange = false;
2845 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2846 IC.replaceOperand(II, 0, V);
2847 MadeChange = true;
2848 }
2849 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2850 IC.replaceOperand(II, 1, V);
2851 MadeChange = true;
2852 }
2853 if (MadeChange) {
2854 return &II;
2855 }
2856 break;
2857 }
2858
2859 case Intrinsic::x86_sse41_pblendvb:
2860 case Intrinsic::x86_sse41_blendvps:
2861 case Intrinsic::x86_sse41_blendvpd:
2862 case Intrinsic::x86_avx_blendv_ps_256:
2863 case Intrinsic::x86_avx_blendv_pd_256:
2864 case Intrinsic::x86_avx2_pblendvb: {
2865 // fold (blend A, A, Mask) -> A
2866 Value *Op0 = II.getArgOperand(0);
2867 Value *Op1 = II.getArgOperand(1);
2868 Value *Mask = II.getArgOperand(2);
2869 if (Op0 == Op1) {
2870 return IC.replaceInstUsesWith(II, Op0);
2871 }
2872
2873 // Zero Mask - select 1st argument.
2874 if (isa<ConstantAggregateZero>(Mask)) {
2875 return IC.replaceInstUsesWith(II, Op0);
2876 }
2877
2878 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2879 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2880 Constant *NewSelector =
2881 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2882 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2883 }
2884
2885 Mask = InstCombiner::peekThroughBitcast(Mask);
2886
2887 // Peek through a one-use shuffle - VectorCombine should have simplified
2888 // this for cases where we're splitting wider vectors to use blendv
2889 // intrinsics.
2890 Value *MaskSrc = nullptr;
2891 ArrayRef<int> ShuffleMask;
2892 if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
2893 m_Mask(ShuffleMask))))) {
2894 // Bail if the shuffle was irregular or contains undefs.
2895 int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2896 if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2897 any_of(ShuffleMask,
2898 [NumElts](int M) { return M < 0 || M >= NumElts; }))
2899 break;
2900 Mask = InstCombiner::peekThroughBitcast(MaskSrc);
2901 }
2902
2903 // Convert to a vector select if we can bypass casts and find a boolean
2904 // vector condition value.
2905 Value *BoolVec;
2906 if (match(Mask, m_SExt(m_Value(BoolVec))) &&
2907 BoolVec->getType()->isVectorTy() &&
2908 BoolVec->getType()->getScalarSizeInBits() == 1) {
2909 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2910 auto *OpTy = cast<FixedVectorType>(II.getType());
2911 unsigned NumMaskElts = MaskTy->getNumElements();
2912 unsigned NumOperandElts = OpTy->getNumElements();
2913
2914 // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2915 if (MaskSrc) {
2916 unsigned NumMaskSrcElts =
2917 cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2918 NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2919 // Multiple mask bits maps to the same operand element - bail out.
2920 if (NumMaskElts > NumOperandElts)
2921 break;
2922 SmallVector<int> ScaledMask;
2923 if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2924 break;
2925 BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2926 MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2927 }
2928 assert(MaskTy->getPrimitiveSizeInBits() ==
2929 OpTy->getPrimitiveSizeInBits() &&
2930 "Not expecting mask and operands with different sizes");
2931
2932 if (NumMaskElts == NumOperandElts) {
2933 return SelectInst::Create(BoolVec, Op1, Op0);
2934 }
2935
2936 // If the mask has less elements than the operands, each mask bit maps to
2937 // multiple elements of the operands. Bitcast back and forth.
2938 if (NumMaskElts < NumOperandElts) {
2939 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2940 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2941 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2942 return new BitCastInst(Sel, II.getType());
2943 }
2944 }
2945
2946 break;
2947 }
2948
2949 case Intrinsic::x86_ssse3_pshuf_b_128:
2950 case Intrinsic::x86_avx2_pshuf_b:
2951 case Intrinsic::x86_avx512_pshuf_b_512:
2952 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2953 return IC.replaceInstUsesWith(II, V);
2954 }
2955 break;
2956
2957 case Intrinsic::x86_avx_vpermilvar_ps:
2958 case Intrinsic::x86_avx_vpermilvar_ps_256:
2959 case Intrinsic::x86_avx512_vpermilvar_ps_512:
2960 case Intrinsic::x86_avx_vpermilvar_pd:
2961 case Intrinsic::x86_avx_vpermilvar_pd_256:
2962 case Intrinsic::x86_avx512_vpermilvar_pd_512:
2963 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2964 return IC.replaceInstUsesWith(II, V);
2965 }
2966 break;
2967
2968 case Intrinsic::x86_avx2_permd:
2969 case Intrinsic::x86_avx2_permps:
2970 case Intrinsic::x86_avx512_permvar_df_256:
2971 case Intrinsic::x86_avx512_permvar_df_512:
2972 case Intrinsic::x86_avx512_permvar_di_256:
2973 case Intrinsic::x86_avx512_permvar_di_512:
2974 case Intrinsic::x86_avx512_permvar_hi_128:
2975 case Intrinsic::x86_avx512_permvar_hi_256:
2976 case Intrinsic::x86_avx512_permvar_hi_512:
2977 case Intrinsic::x86_avx512_permvar_qi_128:
2978 case Intrinsic::x86_avx512_permvar_qi_256:
2979 case Intrinsic::x86_avx512_permvar_qi_512:
2980 case Intrinsic::x86_avx512_permvar_sf_512:
2981 case Intrinsic::x86_avx512_permvar_si_512:
2982 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2983 return IC.replaceInstUsesWith(II, V);
2984 }
2985 break;
2986
2987 case Intrinsic::x86_avx512_vpermi2var_d_128:
2988 case Intrinsic::x86_avx512_vpermi2var_d_256:
2989 case Intrinsic::x86_avx512_vpermi2var_d_512:
2990 case Intrinsic::x86_avx512_vpermi2var_hi_128:
2991 case Intrinsic::x86_avx512_vpermi2var_hi_256:
2992 case Intrinsic::x86_avx512_vpermi2var_hi_512:
2993 case Intrinsic::x86_avx512_vpermi2var_pd_128:
2994 case Intrinsic::x86_avx512_vpermi2var_pd_256:
2995 case Intrinsic::x86_avx512_vpermi2var_pd_512:
2996 case Intrinsic::x86_avx512_vpermi2var_ps_128:
2997 case Intrinsic::x86_avx512_vpermi2var_ps_256:
2998 case Intrinsic::x86_avx512_vpermi2var_ps_512:
2999 case Intrinsic::x86_avx512_vpermi2var_q_128:
3000 case Intrinsic::x86_avx512_vpermi2var_q_256:
3001 case Intrinsic::x86_avx512_vpermi2var_q_512:
3002 case Intrinsic::x86_avx512_vpermi2var_qi_128:
3003 case Intrinsic::x86_avx512_vpermi2var_qi_256:
3004 case Intrinsic::x86_avx512_vpermi2var_qi_512:
3005 if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
3006 return IC.replaceInstUsesWith(II, V);
3007 }
3008 break;
3009
3010 case Intrinsic::x86_avx_maskload_ps:
3011 case Intrinsic::x86_avx_maskload_pd:
3012 case Intrinsic::x86_avx_maskload_ps_256:
3013 case Intrinsic::x86_avx_maskload_pd_256:
3014 case Intrinsic::x86_avx2_maskload_d:
3015 case Intrinsic::x86_avx2_maskload_q:
3016 case Intrinsic::x86_avx2_maskload_d_256:
3017 case Intrinsic::x86_avx2_maskload_q_256:
3018 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
3019 return I;
3020 }
3021 break;
3022
3023 case Intrinsic::x86_sse2_maskmov_dqu:
3024 case Intrinsic::x86_avx_maskstore_ps:
3025 case Intrinsic::x86_avx_maskstore_pd:
3026 case Intrinsic::x86_avx_maskstore_ps_256:
3027 case Intrinsic::x86_avx_maskstore_pd_256:
3028 case Intrinsic::x86_avx2_maskstore_d:
3029 case Intrinsic::x86_avx2_maskstore_q:
3030 case Intrinsic::x86_avx2_maskstore_d_256:
3031 case Intrinsic::x86_avx2_maskstore_q_256:
3032 if (simplifyX86MaskedStore(II, IC)) {
3033 return nullptr;
3034 }
3035 break;
3036
3037 case Intrinsic::x86_addcarry_32:
3038 case Intrinsic::x86_addcarry_64:
3039 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
3040 return IC.replaceInstUsesWith(II, V);
3041 }
3042 break;
3043
3044 case Intrinsic::x86_avx512_pternlog_d_128:
3045 case Intrinsic::x86_avx512_pternlog_d_256:
3046 case Intrinsic::x86_avx512_pternlog_d_512:
3047 case Intrinsic::x86_avx512_pternlog_q_128:
3048 case Intrinsic::x86_avx512_pternlog_q_256:
3049 case Intrinsic::x86_avx512_pternlog_q_512:
3050 if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
3051 return IC.replaceInstUsesWith(II, V);
3052 }
3053 break;
3054 default:
3055 break;
3056 }
3057 return std::nullopt;
3058 }
3059
simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed) const3060 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
3061 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
3062 bool &KnownBitsComputed) const {
3063 switch (II.getIntrinsicID()) {
3064 default:
3065 break;
3066 case Intrinsic::x86_mmx_pmovmskb:
3067 case Intrinsic::x86_sse_movmsk_ps:
3068 case Intrinsic::x86_sse2_movmsk_pd:
3069 case Intrinsic::x86_sse2_pmovmskb_128:
3070 case Intrinsic::x86_avx_movmsk_ps_256:
3071 case Intrinsic::x86_avx_movmsk_pd_256:
3072 case Intrinsic::x86_avx2_pmovmskb: {
3073 // MOVMSK copies the vector elements' sign bits to the low bits
3074 // and zeros the high bits.
3075 unsigned ArgWidth;
3076 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
3077 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
3078 } else {
3079 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
3080 ArgWidth = ArgType->getNumElements();
3081 }
3082
3083 // If we don't need any of low bits then return zero,
3084 // we know that DemandedMask is non-zero already.
3085 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3086 Type *VTy = II.getType();
3087 if (DemandedElts.isZero()) {
3088 return ConstantInt::getNullValue(VTy);
3089 }
3090
3091 // We know that the upper bits are set to zero.
3092 Known.Zero.setBitsFrom(ArgWidth);
3093 KnownBitsComputed = true;
3094 break;
3095 }
3096 }
3097 return std::nullopt;
3098 }
3099
simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> simplifyAndSetOp) const3100 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3101 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3102 APInt &UndefElts2, APInt &UndefElts3,
3103 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3104 simplifyAndSetOp) const {
3105 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3106 switch (II.getIntrinsicID()) {
3107 default:
3108 break;
3109 case Intrinsic::x86_xop_vfrcz_ss:
3110 case Intrinsic::x86_xop_vfrcz_sd:
3111 // The instructions for these intrinsics are speced to zero upper bits not
3112 // pass them through like other scalar intrinsics. So we shouldn't just
3113 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
3114 // Instead we should return a zero vector.
3115 if (!DemandedElts[0]) {
3116 IC.addToWorklist(&II);
3117 return ConstantAggregateZero::get(II.getType());
3118 }
3119
3120 // Only the lower element is used.
3121 DemandedElts = 1;
3122 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3123
3124 // Only the lower element is undefined. The high elements are zero.
3125 UndefElts = UndefElts[0];
3126 break;
3127
3128 // Unary scalar-as-vector operations that work column-wise.
3129 case Intrinsic::x86_sse_rcp_ss:
3130 case Intrinsic::x86_sse_rsqrt_ss:
3131 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3132
3133 // If lowest element of a scalar op isn't used then use Arg0.
3134 if (!DemandedElts[0]) {
3135 IC.addToWorklist(&II);
3136 return II.getArgOperand(0);
3137 }
3138 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
3139 // checks).
3140 break;
3141
3142 // Binary scalar-as-vector operations that work column-wise. The high
3143 // elements come from operand 0. The low element is a function of both
3144 // operands.
3145 case Intrinsic::x86_sse_min_ss:
3146 case Intrinsic::x86_sse_max_ss:
3147 case Intrinsic::x86_sse_cmp_ss:
3148 case Intrinsic::x86_sse2_min_sd:
3149 case Intrinsic::x86_sse2_max_sd:
3150 case Intrinsic::x86_sse2_cmp_sd: {
3151 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3152
3153 // If lowest element of a scalar op isn't used then use Arg0.
3154 if (!DemandedElts[0]) {
3155 IC.addToWorklist(&II);
3156 return II.getArgOperand(0);
3157 }
3158
3159 // Only lower element is used for operand 1.
3160 DemandedElts = 1;
3161 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3162
3163 // Lower element is undefined if both lower elements are undefined.
3164 // Consider things like undef&0. The result is known zero, not undef.
3165 if (!UndefElts2[0])
3166 UndefElts.clearBit(0);
3167
3168 break;
3169 }
3170
3171 // Binary scalar-as-vector operations that work column-wise. The high
3172 // elements come from operand 0 and the low element comes from operand 1.
3173 case Intrinsic::x86_sse41_round_ss:
3174 case Intrinsic::x86_sse41_round_sd: {
3175 // Don't use the low element of operand 0.
3176 APInt DemandedElts2 = DemandedElts;
3177 DemandedElts2.clearBit(0);
3178 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3179
3180 // If lowest element of a scalar op isn't used then use Arg0.
3181 if (!DemandedElts[0]) {
3182 IC.addToWorklist(&II);
3183 return II.getArgOperand(0);
3184 }
3185
3186 // Only lower element is used for operand 1.
3187 DemandedElts = 1;
3188 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3189
3190 // Take the high undef elements from operand 0 and take the lower element
3191 // from operand 1.
3192 UndefElts.clearBit(0);
3193 UndefElts |= UndefElts2[0];
3194 break;
3195 }
3196
3197 // Three input scalar-as-vector operations that work column-wise. The high
3198 // elements come from operand 0 and the low element is a function of all
3199 // three inputs.
3200 case Intrinsic::x86_avx512_mask_add_ss_round:
3201 case Intrinsic::x86_avx512_mask_div_ss_round:
3202 case Intrinsic::x86_avx512_mask_mul_ss_round:
3203 case Intrinsic::x86_avx512_mask_sub_ss_round:
3204 case Intrinsic::x86_avx512_mask_max_ss_round:
3205 case Intrinsic::x86_avx512_mask_min_ss_round:
3206 case Intrinsic::x86_avx512_mask_add_sd_round:
3207 case Intrinsic::x86_avx512_mask_div_sd_round:
3208 case Intrinsic::x86_avx512_mask_mul_sd_round:
3209 case Intrinsic::x86_avx512_mask_sub_sd_round:
3210 case Intrinsic::x86_avx512_mask_max_sd_round:
3211 case Intrinsic::x86_avx512_mask_min_sd_round:
3212 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3213
3214 // If lowest element of a scalar op isn't used then use Arg0.
3215 if (!DemandedElts[0]) {
3216 IC.addToWorklist(&II);
3217 return II.getArgOperand(0);
3218 }
3219
3220 // Only lower element is used for operand 1 and 2.
3221 DemandedElts = 1;
3222 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3223 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3224
3225 // Lower element is undefined if all three lower elements are undefined.
3226 // Consider things like undef&0. The result is known zero, not undef.
3227 if (!UndefElts2[0] || !UndefElts3[0])
3228 UndefElts.clearBit(0);
3229 break;
3230
3231 // TODO: Add fmaddsub support?
3232 case Intrinsic::x86_sse3_addsub_pd:
3233 case Intrinsic::x86_sse3_addsub_ps:
3234 case Intrinsic::x86_avx_addsub_pd_256:
3235 case Intrinsic::x86_avx_addsub_ps_256: {
3236 // If none of the even or none of the odd lanes are required, turn this
3237 // into a generic FP math instruction.
3238 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3239 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3240 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3241 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3242 if (IsSubOnly || IsAddOnly) {
3243 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3244 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3245 IC.Builder.SetInsertPoint(&II);
3246 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3247 return IC.Builder.CreateBinOp(
3248 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3249 }
3250
3251 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3252 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3253 UndefElts &= UndefElts2;
3254 break;
3255 }
3256
3257 // General per-element vector operations.
3258 case Intrinsic::x86_avx2_psllv_d:
3259 case Intrinsic::x86_avx2_psllv_d_256:
3260 case Intrinsic::x86_avx2_psllv_q:
3261 case Intrinsic::x86_avx2_psllv_q_256:
3262 case Intrinsic::x86_avx2_psrlv_d:
3263 case Intrinsic::x86_avx2_psrlv_d_256:
3264 case Intrinsic::x86_avx2_psrlv_q:
3265 case Intrinsic::x86_avx2_psrlv_q_256:
3266 case Intrinsic::x86_avx2_psrav_d:
3267 case Intrinsic::x86_avx2_psrav_d_256: {
3268 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3269 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3270 UndefElts &= UndefElts2;
3271 break;
3272 }
3273
3274 case Intrinsic::x86_sse2_pmulh_w:
3275 case Intrinsic::x86_avx2_pmulh_w:
3276 case Intrinsic::x86_avx512_pmulh_w_512:
3277 case Intrinsic::x86_sse2_pmulhu_w:
3278 case Intrinsic::x86_avx2_pmulhu_w:
3279 case Intrinsic::x86_avx512_pmulhu_w_512:
3280 case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3281 case Intrinsic::x86_avx2_pmul_hr_sw:
3282 case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3283 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3284 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3285 // NOTE: mulh(undef,undef) != undef.
3286 break;
3287 }
3288
3289 case Intrinsic::x86_sse2_packssdw_128:
3290 case Intrinsic::x86_sse2_packsswb_128:
3291 case Intrinsic::x86_sse2_packuswb_128:
3292 case Intrinsic::x86_sse41_packusdw:
3293 case Intrinsic::x86_avx2_packssdw:
3294 case Intrinsic::x86_avx2_packsswb:
3295 case Intrinsic::x86_avx2_packusdw:
3296 case Intrinsic::x86_avx2_packuswb:
3297 case Intrinsic::x86_avx512_packssdw_512:
3298 case Intrinsic::x86_avx512_packsswb_512:
3299 case Intrinsic::x86_avx512_packusdw_512:
3300 case Intrinsic::x86_avx512_packuswb_512: {
3301 auto *Ty0 = II.getArgOperand(0)->getType();
3302 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3303 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3304
3305 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3306 unsigned VWidthPerLane = VWidth / NumLanes;
3307 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3308
3309 // Per lane, pack the elements of the first input and then the second.
3310 // e.g.
3311 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3312 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3313 for (int OpNum = 0; OpNum != 2; ++OpNum) {
3314 APInt OpDemandedElts(InnerVWidth, 0);
3315 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3316 unsigned LaneIdx = Lane * VWidthPerLane;
3317 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3318 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3319 if (DemandedElts[Idx])
3320 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3321 }
3322 }
3323
3324 // Demand elements from the operand.
3325 APInt OpUndefElts(InnerVWidth, 0);
3326 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3327
3328 // Pack the operand's UNDEF elements, one lane at a time.
3329 OpUndefElts = OpUndefElts.zext(VWidth);
3330 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3331 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3332 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3333 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3334 UndefElts |= LaneElts;
3335 }
3336 }
3337 break;
3338 }
3339
3340 case Intrinsic::x86_sse2_pmadd_wd:
3341 case Intrinsic::x86_avx2_pmadd_wd:
3342 case Intrinsic::x86_avx512_pmaddw_d_512:
3343 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3344 case Intrinsic::x86_avx2_pmadd_ub_sw:
3345 case Intrinsic::x86_avx512_pmaddubs_w_512: {
3346 // PMADD - demand both src elements that map to each dst element.
3347 auto *ArgTy = II.getArgOperand(0)->getType();
3348 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3349 assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3350 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3351 APInt Op0UndefElts(InnerVWidth, 0);
3352 APInt Op1UndefElts(InnerVWidth, 0);
3353 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3354 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3355 // NOTE: madd(undef,undef) != undef.
3356 break;
3357 }
3358
3359 // PSHUFB
3360 case Intrinsic::x86_ssse3_pshuf_b_128:
3361 case Intrinsic::x86_avx2_pshuf_b:
3362 case Intrinsic::x86_avx512_pshuf_b_512:
3363 // PERMILVAR
3364 case Intrinsic::x86_avx_vpermilvar_ps:
3365 case Intrinsic::x86_avx_vpermilvar_ps_256:
3366 case Intrinsic::x86_avx512_vpermilvar_ps_512:
3367 case Intrinsic::x86_avx_vpermilvar_pd:
3368 case Intrinsic::x86_avx_vpermilvar_pd_256:
3369 case Intrinsic::x86_avx512_vpermilvar_pd_512:
3370 // PERMV
3371 case Intrinsic::x86_avx2_permd:
3372 case Intrinsic::x86_avx2_permps: {
3373 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3374 break;
3375 }
3376
3377 // SSE4A instructions leave the upper 64-bits of the 128-bit result
3378 // in an undefined state.
3379 case Intrinsic::x86_sse4a_extrq:
3380 case Intrinsic::x86_sse4a_extrqi:
3381 case Intrinsic::x86_sse4a_insertq:
3382 case Intrinsic::x86_sse4a_insertqi:
3383 UndefElts.setHighBits(VWidth / 2);
3384 break;
3385 }
3386 return std::nullopt;
3387 }
3388