1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements a TargetTransformInfo analysis pass specific to the 10 /// X86 target machine. It uses the target's detailed information to provide 11 /// more precise answers to certain TTI queries, while letting the target 12 /// independent and default TTI implementations handle the rest. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "X86TargetTransformInfo.h" 17 #include "llvm/IR/IntrinsicInst.h" 18 #include "llvm/IR/IntrinsicsX86.h" 19 #include "llvm/Support/KnownBits.h" 20 #include "llvm/Transforms/InstCombine/InstCombiner.h" 21 #include <optional> 22 23 using namespace llvm; 24 using namespace llvm::PatternMatch; 25 26 #define DEBUG_TYPE "x86tti" 27 28 /// Return a constant boolean vector that has true elements in all positions 29 /// where the input constant data vector has an element with the sign bit set. 30 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) { 31 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); 32 V = ConstantExpr::getBitCast(V, IntTy); 33 V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT, 34 Constant::getNullValue(IntTy), V, DL); 35 assert(V && "Vector must be foldable"); 36 return V; 37 } 38 39 /// Convert the x86 XMM integer vector mask to a vector of bools based on 40 /// each element's most significant bit (the sign bit). 41 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) { 42 // Fold Constant Mask. 43 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) 44 return getNegativeIsTrueBoolVec(ConstantMask, DL); 45 46 // Mask was extended from a boolean vector. 47 Value *ExtMask; 48 if (match(Mask, m_SExt(m_Value(ExtMask))) && 49 ExtMask->getType()->isIntOrIntVectorTy(1)) 50 return ExtMask; 51 52 return nullptr; 53 } 54 55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 56 // XMM register mask efficiently, we could transform all x86 masked intrinsics 57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 59 Value *Ptr = II.getOperand(0); 60 Value *Mask = II.getOperand(1); 61 Constant *ZeroVec = Constant::getNullValue(II.getType()); 62 63 // Zero Mask - masked load instruction creates a zero vector. 64 if (isa<ConstantAggregateZero>(Mask)) 65 return IC.replaceInstUsesWith(II, ZeroVec); 66 67 // The mask is constant or extended from a bool vector. Convert this x86 68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 70 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 71 // the LLVM intrinsic definition for the pointer argument. 72 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 73 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 74 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 75 76 // The pass-through vector for an x86 masked load is a zero vector. 
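    // For example (an illustrative 128-bit case), when the mask was created by
    // "%m = sext <4 x i1> %b to <4 x i32>" the load becomes roughly:
    //   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %ptr, i32 1,
    //                             <4 x i1> %b, <4 x float> zeroinitializer)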
77 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( 78 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec); 79 return IC.replaceInstUsesWith(II, NewMaskedLoad); 80 } 81 82 return nullptr; 83 } 84 85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 86 // XMM register mask efficiently, we could transform all x86 masked intrinsics 87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 89 Value *Ptr = II.getOperand(0); 90 Value *Mask = II.getOperand(1); 91 Value *Vec = II.getOperand(2); 92 93 // Zero Mask - this masked store instruction does nothing. 94 if (isa<ConstantAggregateZero>(Mask)) { 95 IC.eraseInstFromFunction(II); 96 return true; 97 } 98 99 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 100 // anything else at this level. 101 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 102 return false; 103 104 // The mask is constant or extended from a bool vector. Convert this x86 105 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 106 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 107 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 108 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 109 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 110 111 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 112 113 // 'Replace uses' doesn't work for stores. Erase the original masked store. 114 IC.eraseInstFromFunction(II); 115 return true; 116 } 117 118 return false; 119 } 120 121 static Value *simplifyX86immShift(const IntrinsicInst &II, 122 InstCombiner::BuilderTy &Builder) { 123 bool LogicalShift = false; 124 bool ShiftLeft = false; 125 bool IsImm = false; 126 127 switch (II.getIntrinsicID()) { 128 default: 129 llvm_unreachable("Unexpected intrinsic!"); 130 case Intrinsic::x86_sse2_psrai_d: 131 case Intrinsic::x86_sse2_psrai_w: 132 case Intrinsic::x86_avx2_psrai_d: 133 case Intrinsic::x86_avx2_psrai_w: 134 case Intrinsic::x86_avx512_psrai_q_128: 135 case Intrinsic::x86_avx512_psrai_q_256: 136 case Intrinsic::x86_avx512_psrai_d_512: 137 case Intrinsic::x86_avx512_psrai_q_512: 138 case Intrinsic::x86_avx512_psrai_w_512: 139 IsImm = true; 140 [[fallthrough]]; 141 case Intrinsic::x86_sse2_psra_d: 142 case Intrinsic::x86_sse2_psra_w: 143 case Intrinsic::x86_avx2_psra_d: 144 case Intrinsic::x86_avx2_psra_w: 145 case Intrinsic::x86_avx512_psra_q_128: 146 case Intrinsic::x86_avx512_psra_q_256: 147 case Intrinsic::x86_avx512_psra_d_512: 148 case Intrinsic::x86_avx512_psra_q_512: 149 case Intrinsic::x86_avx512_psra_w_512: 150 LogicalShift = false; 151 ShiftLeft = false; 152 break; 153 case Intrinsic::x86_sse2_psrli_d: 154 case Intrinsic::x86_sse2_psrli_q: 155 case Intrinsic::x86_sse2_psrli_w: 156 case Intrinsic::x86_avx2_psrli_d: 157 case Intrinsic::x86_avx2_psrli_q: 158 case Intrinsic::x86_avx2_psrli_w: 159 case Intrinsic::x86_avx512_psrli_d_512: 160 case Intrinsic::x86_avx512_psrli_q_512: 161 case Intrinsic::x86_avx512_psrli_w_512: 162 IsImm = true; 163 [[fallthrough]]; 164 case Intrinsic::x86_sse2_psrl_d: 165 case Intrinsic::x86_sse2_psrl_q: 166 case Intrinsic::x86_sse2_psrl_w: 167 case Intrinsic::x86_avx2_psrl_d: 168 case Intrinsic::x86_avx2_psrl_q: 169 case Intrinsic::x86_avx2_psrl_w: 170 case Intrinsic::x86_avx512_psrl_d_512: 171 case Intrinsic::x86_avx512_psrl_q_512: 172 case Intrinsic::x86_avx512_psrl_w_512: 
173 LogicalShift = true; 174 ShiftLeft = false; 175 break; 176 case Intrinsic::x86_sse2_pslli_d: 177 case Intrinsic::x86_sse2_pslli_q: 178 case Intrinsic::x86_sse2_pslli_w: 179 case Intrinsic::x86_avx2_pslli_d: 180 case Intrinsic::x86_avx2_pslli_q: 181 case Intrinsic::x86_avx2_pslli_w: 182 case Intrinsic::x86_avx512_pslli_d_512: 183 case Intrinsic::x86_avx512_pslli_q_512: 184 case Intrinsic::x86_avx512_pslli_w_512: 185 IsImm = true; 186 [[fallthrough]]; 187 case Intrinsic::x86_sse2_psll_d: 188 case Intrinsic::x86_sse2_psll_q: 189 case Intrinsic::x86_sse2_psll_w: 190 case Intrinsic::x86_avx2_psll_d: 191 case Intrinsic::x86_avx2_psll_q: 192 case Intrinsic::x86_avx2_psll_w: 193 case Intrinsic::x86_avx512_psll_d_512: 194 case Intrinsic::x86_avx512_psll_q_512: 195 case Intrinsic::x86_avx512_psll_w_512: 196 LogicalShift = true; 197 ShiftLeft = true; 198 break; 199 } 200 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 201 202 Value *Vec = II.getArgOperand(0); 203 Value *Amt = II.getArgOperand(1); 204 auto *VT = cast<FixedVectorType>(Vec->getType()); 205 Type *SVT = VT->getElementType(); 206 Type *AmtVT = Amt->getType(); 207 unsigned VWidth = VT->getNumElements(); 208 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 209 210 // If the shift amount is guaranteed to be in-range we can replace it with a 211 // generic shift. If its guaranteed to be out of range, logical shifts combine 212 // to zero and arithmetic shifts are clamped to (BitWidth - 1). 213 if (IsImm) { 214 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); 215 KnownBits KnownAmtBits = 216 llvm::computeKnownBits(Amt, II.getDataLayout()); 217 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 218 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 219 Amt = Builder.CreateVectorSplat(VWidth, Amt); 220 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 221 : Builder.CreateLShr(Vec, Amt)) 222 : Builder.CreateAShr(Vec, Amt)); 223 } 224 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 225 if (LogicalShift) 226 return ConstantAggregateZero::get(VT); 227 Amt = ConstantInt::get(SVT, BitWidth - 1); 228 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 229 } 230 } else { 231 // Ensure the first element has an in-range value and the rest of the 232 // elements in the bottom 64 bits are zero. 233 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 234 cast<VectorType>(AmtVT)->getElementType() == SVT && 235 "Unexpected shift-by-scalar type"); 236 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); 237 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 238 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 239 KnownBits KnownLowerBits = llvm::computeKnownBits( 240 Amt, DemandedLower, II.getDataLayout()); 241 KnownBits KnownUpperBits = llvm::computeKnownBits( 242 Amt, DemandedUpper, II.getDataLayout()); 243 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 244 (DemandedUpper.isZero() || KnownUpperBits.isZero())) { 245 SmallVector<int, 16> ZeroSplat(VWidth, 0); 246 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); 247 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 248 : Builder.CreateLShr(Vec, Amt)) 249 : Builder.CreateAShr(Vec, Amt)); 250 } 251 } 252 253 // Simplify if count is constant vector. 254 auto *CDV = dyn_cast<ConstantDataVector>(Amt); 255 if (!CDV) 256 return nullptr; 257 258 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 259 // operand to compute the shift amount. 
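  // For example, for a v8i16 shift a count vector of <1, 0, 0, 0, ...> gives
  // Count == 1, while <0, 1, 0, 0, ...> gives Count == 0x10000, which is then
  // treated as out of range below.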
260 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 261 cast<VectorType>(AmtVT)->getElementType() == SVT && 262 "Unexpected shift-by-scalar type"); 263 264 // Concatenate the sub-elements to create the 64-bit value. 265 APInt Count(64, 0); 266 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 267 unsigned SubEltIdx = (NumSubElts - 1) - i; 268 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 269 Count <<= BitWidth; 270 Count |= SubElt->getValue().zextOrTrunc(64); 271 } 272 273 // If shift-by-zero then just return the original value. 274 if (Count.isZero()) 275 return Vec; 276 277 // Handle cases when Shift >= BitWidth. 278 if (Count.uge(BitWidth)) { 279 // If LogicalShift - just return zero. 280 if (LogicalShift) 281 return ConstantAggregateZero::get(VT); 282 283 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 284 Count = APInt(64, BitWidth - 1); 285 } 286 287 // Get a constant vector of the same type as the first operand. 288 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 289 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 290 291 if (ShiftLeft) 292 return Builder.CreateShl(Vec, ShiftVec); 293 294 if (LogicalShift) 295 return Builder.CreateLShr(Vec, ShiftVec); 296 297 return Builder.CreateAShr(Vec, ShiftVec); 298 } 299 300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 303 static Value *simplifyX86varShift(const IntrinsicInst &II, 304 InstCombiner::BuilderTy &Builder) { 305 bool LogicalShift = false; 306 bool ShiftLeft = false; 307 308 switch (II.getIntrinsicID()) { 309 default: 310 llvm_unreachable("Unexpected intrinsic!"); 311 case Intrinsic::x86_avx2_psrav_d: 312 case Intrinsic::x86_avx2_psrav_d_256: 313 case Intrinsic::x86_avx512_psrav_q_128: 314 case Intrinsic::x86_avx512_psrav_q_256: 315 case Intrinsic::x86_avx512_psrav_d_512: 316 case Intrinsic::x86_avx512_psrav_q_512: 317 case Intrinsic::x86_avx512_psrav_w_128: 318 case Intrinsic::x86_avx512_psrav_w_256: 319 case Intrinsic::x86_avx512_psrav_w_512: 320 LogicalShift = false; 321 ShiftLeft = false; 322 break; 323 case Intrinsic::x86_avx2_psrlv_d: 324 case Intrinsic::x86_avx2_psrlv_d_256: 325 case Intrinsic::x86_avx2_psrlv_q: 326 case Intrinsic::x86_avx2_psrlv_q_256: 327 case Intrinsic::x86_avx512_psrlv_d_512: 328 case Intrinsic::x86_avx512_psrlv_q_512: 329 case Intrinsic::x86_avx512_psrlv_w_128: 330 case Intrinsic::x86_avx512_psrlv_w_256: 331 case Intrinsic::x86_avx512_psrlv_w_512: 332 LogicalShift = true; 333 ShiftLeft = false; 334 break; 335 case Intrinsic::x86_avx2_psllv_d: 336 case Intrinsic::x86_avx2_psllv_d_256: 337 case Intrinsic::x86_avx2_psllv_q: 338 case Intrinsic::x86_avx2_psllv_q_256: 339 case Intrinsic::x86_avx512_psllv_d_512: 340 case Intrinsic::x86_avx512_psllv_q_512: 341 case Intrinsic::x86_avx512_psllv_w_128: 342 case Intrinsic::x86_avx512_psllv_w_256: 343 case Intrinsic::x86_avx512_psllv_w_512: 344 LogicalShift = true; 345 ShiftLeft = true; 346 break; 347 } 348 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 349 350 Value *Vec = II.getArgOperand(0); 351 Value *Amt = II.getArgOperand(1); 352 auto *VT = cast<FixedVectorType>(II.getType()); 353 Type *SVT = VT->getElementType(); 354 int NumElts = VT->getNumElements(); 355 int BitWidth = SVT->getIntegerBitWidth(); 356 357 // If the shift 
amount is guaranteed to be in-range we can replace it with a 358 // generic shift. 359 KnownBits KnownAmt = 360 llvm::computeKnownBits(Amt, II.getDataLayout()); 361 if (KnownAmt.getMaxValue().ult(BitWidth)) { 362 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 363 : Builder.CreateLShr(Vec, Amt)) 364 : Builder.CreateAShr(Vec, Amt)); 365 } 366 367 // Simplify if all shift amounts are constant/undef. 368 auto *CShift = dyn_cast<Constant>(Amt); 369 if (!CShift) 370 return nullptr; 371 372 // Collect each element's shift amount. 373 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 374 bool AnyOutOfRange = false; 375 SmallVector<int, 8> ShiftAmts; 376 for (int I = 0; I < NumElts; ++I) { 377 auto *CElt = CShift->getAggregateElement(I); 378 if (isa_and_nonnull<UndefValue>(CElt)) { 379 ShiftAmts.push_back(-1); 380 continue; 381 } 382 383 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 384 if (!COp) 385 return nullptr; 386 387 // Handle out of range shifts. 388 // If LogicalShift - set to BitWidth (special case). 389 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 390 APInt ShiftVal = COp->getValue(); 391 if (ShiftVal.uge(BitWidth)) { 392 AnyOutOfRange = LogicalShift; 393 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 394 continue; 395 } 396 397 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 398 } 399 400 // If all elements out of range or UNDEF, return vector of zeros/undefs. 401 // ArithmeticShift should only hit this if they are all UNDEF. 402 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 403 if (llvm::all_of(ShiftAmts, OutOfRange)) { 404 SmallVector<Constant *, 8> ConstantVec; 405 for (int Idx : ShiftAmts) { 406 if (Idx < 0) { 407 ConstantVec.push_back(UndefValue::get(SVT)); 408 } else { 409 assert(LogicalShift && "Logical shift expected"); 410 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 411 } 412 } 413 return ConstantVector::get(ConstantVec); 414 } 415 416 // We can't handle only some out of range values with generic logical shifts. 417 if (AnyOutOfRange) 418 return nullptr; 419 420 // Build the shift amount constant vector. 421 SmallVector<Constant *, 8> ShiftVecAmts; 422 for (int Idx : ShiftAmts) { 423 if (Idx < 0) 424 ShiftVecAmts.push_back(UndefValue::get(SVT)); 425 else 426 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 427 } 428 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 429 430 if (ShiftLeft) 431 return Builder.CreateShl(Vec, ShiftVec); 432 433 if (LogicalShift) 434 return Builder.CreateLShr(Vec, ShiftVec); 435 436 return Builder.CreateAShr(Vec, ShiftVec); 437 } 438 439 static Value *simplifyX86pack(IntrinsicInst &II, 440 InstCombiner::BuilderTy &Builder, bool IsSigned) { 441 Value *Arg0 = II.getArgOperand(0); 442 Value *Arg1 = II.getArgOperand(1); 443 Type *ResTy = II.getType(); 444 445 // Fast all undef handling. 
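  // With both inputs undef the saturated pack result is undef as well; a
  // single undef operand is left to the generic constant-folding path below.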
446 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 447 return UndefValue::get(ResTy); 448 449 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 450 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 451 unsigned NumSrcElts = ArgTy->getNumElements(); 452 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && 453 "Unexpected packing types"); 454 455 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 456 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 457 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 458 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 459 "Unexpected packing types"); 460 461 // Constant folding. 462 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 463 return nullptr; 464 465 // Clamp Values - signed/unsigned both use signed clamp values, but they 466 // differ on the min/max values. 467 APInt MinValue, MaxValue; 468 if (IsSigned) { 469 // PACKSS: Truncate signed value with signed saturation. 470 // Source values less than dst minint are saturated to minint. 471 // Source values greater than dst maxint are saturated to maxint. 472 MinValue = 473 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 474 MaxValue = 475 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 476 } else { 477 // PACKUS: Truncate signed value with unsigned saturation. 478 // Source values less than zero are saturated to zero. 479 // Source values greater than dst maxuint are saturated to maxuint. 480 MinValue = APInt::getZero(SrcScalarSizeInBits); 481 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 482 } 483 484 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 485 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 488 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 489 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 490 491 // Shuffle clamped args together at the lane level. 492 SmallVector<int, 32> PackMask; 493 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 496 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 497 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 498 } 499 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 500 501 // Truncate to dst size. 502 return Builder.CreateTrunc(Shuffle, ResTy); 503 } 504 505 static Value *simplifyX86pmulh(IntrinsicInst &II, 506 InstCombiner::BuilderTy &Builder, bool IsSigned, 507 bool IsRounding) { 508 Value *Arg0 = II.getArgOperand(0); 509 Value *Arg1 = II.getArgOperand(1); 510 auto *ResTy = cast<FixedVectorType>(II.getType()); 511 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 512 assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 && 513 "Unexpected PMULH types"); 514 assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed"); 515 516 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero. 517 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) 518 return ConstantAggregateZero::get(ResTy); 519 520 // Multiply by zero. 521 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) 522 return ConstantAggregateZero::get(ResTy); 523 524 // Multiply by one. 
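  // For a signed multiply the high half of x * 1 is just the sign bit of x
  // splatted (x >> 15); for an unsigned multiply it is always zero.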
525 if (!IsRounding) { 526 if (match(Arg0, m_One())) 527 return IsSigned ? Builder.CreateAShr(Arg1, 15) 528 : ConstantAggregateZero::get(ResTy); 529 if (match(Arg1, m_One())) 530 return IsSigned ? Builder.CreateAShr(Arg0, 15) 531 : ConstantAggregateZero::get(ResTy); 532 } 533 534 // Constant folding. 535 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 536 return nullptr; 537 538 // Extend to twice the width and multiply. 539 auto Cast = 540 IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt; 541 auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy); 542 Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy); 543 Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy); 544 Value *Mul = Builder.CreateMul(LHS, RHS); 545 546 if (IsRounding) { 547 // PMULHRSW: truncate to vXi18 of the most significant bits, add one and 548 // extract bits[16:1]. 549 auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18); 550 auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy); 551 Mul = Builder.CreateLShr(Mul, 14); 552 Mul = Builder.CreateTrunc(Mul, RndTy); 553 Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1)); 554 Mul = Builder.CreateLShr(Mul, 1); 555 } else { 556 // PMULH/PMULHU: extract the vXi16 most significant bits. 557 Mul = Builder.CreateLShr(Mul, 16); 558 } 559 560 return Builder.CreateTrunc(Mul, ResTy); 561 } 562 563 static Value *simplifyX86pmadd(IntrinsicInst &II, 564 InstCombiner::BuilderTy &Builder, 565 bool IsPMADDWD) { 566 Value *Arg0 = II.getArgOperand(0); 567 Value *Arg1 = II.getArgOperand(1); 568 auto *ResTy = cast<FixedVectorType>(II.getType()); 569 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 570 571 unsigned NumDstElts = ResTy->getNumElements(); 572 assert(ArgTy->getNumElements() == (2 * NumDstElts) && 573 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) && 574 "Unexpected PMADD types"); 575 576 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero. 577 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) 578 return ConstantAggregateZero::get(ResTy); 579 580 // Multiply by zero. 581 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) 582 return ConstantAggregateZero::get(ResTy); 583 584 // Constant folding. 585 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 586 return nullptr; 587 588 // Split Lo/Hi elements pairs, extend and add together. 589 // PMADDWD(X,Y) = 590 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1]))) 591 // PMADDUBSW(X,Y) = 592 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1]))) 593 SmallVector<int> LoMask, HiMask; 594 for (unsigned I = 0; I != NumDstElts; ++I) { 595 LoMask.push_back(2 * I + 0); 596 HiMask.push_back(2 * I + 1); 597 } 598 599 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask); 600 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask); 601 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask); 602 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask); 603 604 auto LHSCast = 605 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt; 606 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy); 607 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy); 608 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy); 609 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy); 610 Value *Lo = Builder.CreateMul(LHSLo, RHSLo); 611 Value *Hi = Builder.CreateMul(LHSHi, RHSHi); 612 return IsPMADDWD 613 ? 
Builder.CreateAdd(Lo, Hi) 614 : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi}); 615 } 616 617 static Value *simplifyX86movmsk(const IntrinsicInst &II, 618 InstCombiner::BuilderTy &Builder) { 619 Value *Arg = II.getArgOperand(0); 620 Type *ResTy = II.getType(); 621 622 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 623 if (isa<UndefValue>(Arg)) 624 return Constant::getNullValue(ResTy); 625 626 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); 627 // We can't easily peek through x86_mmx types. 628 if (!ArgTy) 629 return nullptr; 630 631 // Expand MOVMSK to compare/bitcast/zext: 632 // e.g. PMOVMSKB(v16i8 x): 633 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 634 // %int = bitcast <16 x i1> %cmp to i16 635 // %res = zext i16 %int to i32 636 unsigned NumElts = ArgTy->getNumElements(); 637 Type *IntegerTy = Builder.getIntNTy(NumElts); 638 639 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy)); 640 Res = Builder.CreateIsNeg(Res); 641 Res = Builder.CreateBitCast(Res, IntegerTy); 642 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 643 return Res; 644 } 645 646 static Value *simplifyX86addcarry(const IntrinsicInst &II, 647 InstCombiner::BuilderTy &Builder) { 648 Value *CarryIn = II.getArgOperand(0); 649 Value *Op1 = II.getArgOperand(1); 650 Value *Op2 = II.getArgOperand(2); 651 Type *RetTy = II.getType(); 652 Type *OpTy = Op1->getType(); 653 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 654 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 655 "Unexpected types for x86 addcarry"); 656 657 // If carry-in is zero, this is just an unsigned add with overflow. 658 if (match(CarryIn, m_ZeroInt())) { 659 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 660 {Op1, Op2}); 661 // The types have to be adjusted to match the x86 call types. 
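    // (llvm.uadd.with.overflow returns { iN result, i1 overflow }, while the
    // x86 intrinsic returns { i8 carry-out, iN result }, so the i1 is
    // zero-extended and the two fields are swapped.)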
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {

  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, m_ImmConstant());
  bool BIsConst = match(ArgB, m_ImmConstant());
  bool CIsConst = match(ArgC, m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Used for verification. It's a big table. It's difficult to go from Imm to
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};

  // Currently we only handle cases that convert directly to another
  // instruction or cases where all the ops are constant. This is because we
  // don't properly handle creating ternary ops in the backend, so splitting
  // them here may cause regressions. As the backend improves, uncomment more
  // cases.
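  // Each bit of Imm is the truth-table output for one (A,B,C) combination,
  // matching the 0xf0/0xcc/0xaa seeds above: e.g. Imm == 0xe8 is the majority
  // function ((A & B) | (A & C) | (B & C)), and Imm == 0x96 is A ^ B ^ C.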
724 725 uint8_t Imm = ArgImm->getValue().getZExtValue(); 726 switch (Imm) { 727 case 0x0: 728 Res = {Constant::getNullValue(Ty), 0}; 729 break; 730 case 0x1: 731 if (ABCIsConst) 732 Res = Nor(Or(A, B), C); 733 break; 734 case 0x2: 735 if (ABCIsConst) 736 Res = And(Nor(A, B), C); 737 break; 738 case 0x3: 739 if (ABIsConst) 740 Res = Nor(A, B); 741 break; 742 case 0x4: 743 if (ABCIsConst) 744 Res = And(Nor(A, C), B); 745 break; 746 case 0x5: 747 if (ACIsConst) 748 Res = Nor(A, C); 749 break; 750 case 0x6: 751 if (ABCIsConst) 752 Res = Nor(A, Xnor(B, C)); 753 break; 754 case 0x7: 755 if (ABCIsConst) 756 Res = Nor(A, And(B, C)); 757 break; 758 case 0x8: 759 if (ABCIsConst) 760 Res = Nor(A, Nand(B, C)); 761 break; 762 case 0x9: 763 if (ABCIsConst) 764 Res = Nor(A, Xor(B, C)); 765 break; 766 case 0xa: 767 if (ACIsConst) 768 Res = Nor(A, Not(C)); 769 break; 770 case 0xb: 771 if (ABCIsConst) 772 Res = Nor(A, Nor(C, Not(B))); 773 break; 774 case 0xc: 775 if (ABIsConst) 776 Res = Nor(A, Not(B)); 777 break; 778 case 0xd: 779 if (ABCIsConst) 780 Res = Nor(A, Nor(B, Not(C))); 781 break; 782 case 0xe: 783 if (ABCIsConst) 784 Res = Nor(A, Nor(B, C)); 785 break; 786 case 0xf: 787 Res = Not(A); 788 break; 789 case 0x10: 790 if (ABCIsConst) 791 Res = And(A, Nor(B, C)); 792 break; 793 case 0x11: 794 if (BCIsConst) 795 Res = Nor(B, C); 796 break; 797 case 0x12: 798 if (ABCIsConst) 799 Res = Nor(Xnor(A, C), B); 800 break; 801 case 0x13: 802 if (ABCIsConst) 803 Res = Nor(And(A, C), B); 804 break; 805 case 0x14: 806 if (ABCIsConst) 807 Res = Nor(Xnor(A, B), C); 808 break; 809 case 0x15: 810 if (ABCIsConst) 811 Res = Nor(And(A, B), C); 812 break; 813 case 0x16: 814 if (ABCIsConst) 815 Res = Xor(Xor(A, B), And(Nand(A, B), C)); 816 break; 817 case 0x17: 818 if (ABCIsConst) 819 Res = Xor(Or(A, B), Or(Xnor(A, B), C)); 820 break; 821 case 0x18: 822 if (ABCIsConst) 823 Res = Nor(Xnor(A, B), Xnor(A, C)); 824 break; 825 case 0x19: 826 if (ABCIsConst) 827 Res = And(Nand(A, B), Xnor(B, C)); 828 break; 829 case 0x1a: 830 if (ABCIsConst) 831 Res = Xor(A, Or(And(A, B), C)); 832 break; 833 case 0x1b: 834 if (ABCIsConst) 835 Res = Xor(A, Or(Xnor(A, B), C)); 836 break; 837 case 0x1c: 838 if (ABCIsConst) 839 Res = Xor(A, Or(And(A, C), B)); 840 break; 841 case 0x1d: 842 if (ABCIsConst) 843 Res = Xor(A, Or(Xnor(A, C), B)); 844 break; 845 case 0x1e: 846 if (ABCIsConst) 847 Res = Xor(A, Or(B, C)); 848 break; 849 case 0x1f: 850 if (ABCIsConst) 851 Res = Nand(A, Or(B, C)); 852 break; 853 case 0x20: 854 if (ABCIsConst) 855 Res = Nor(Nand(A, C), B); 856 break; 857 case 0x21: 858 if (ABCIsConst) 859 Res = Nor(Xor(A, C), B); 860 break; 861 case 0x22: 862 if (BCIsConst) 863 Res = Nor(B, Not(C)); 864 break; 865 case 0x23: 866 if (ABCIsConst) 867 Res = Nor(B, Nor(C, Not(A))); 868 break; 869 case 0x24: 870 if (ABCIsConst) 871 Res = Nor(Xnor(A, B), Xor(A, C)); 872 break; 873 case 0x25: 874 if (ABCIsConst) 875 Res = Xor(A, Nand(Nand(A, B), C)); 876 break; 877 case 0x26: 878 if (ABCIsConst) 879 Res = And(Nand(A, B), Xor(B, C)); 880 break; 881 case 0x27: 882 if (ABCIsConst) 883 Res = Xor(Or(Xnor(A, B), C), B); 884 break; 885 case 0x28: 886 if (ABCIsConst) 887 Res = And(Xor(A, B), C); 888 break; 889 case 0x29: 890 if (ABCIsConst) 891 Res = Xor(Xor(A, B), Nor(And(A, B), C)); 892 break; 893 case 0x2a: 894 if (ABCIsConst) 895 Res = And(Nand(A, B), C); 896 break; 897 case 0x2b: 898 if (ABCIsConst) 899 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A); 900 break; 901 case 0x2c: 902 if (ABCIsConst) 903 Res = Nor(Xnor(A, B), Nor(B, C)); 904 break; 905 case 0x2d: 
906 if (ABCIsConst) 907 Res = Xor(A, Or(B, Not(C))); 908 break; 909 case 0x2e: 910 if (ABCIsConst) 911 Res = Xor(A, Or(Xor(A, C), B)); 912 break; 913 case 0x2f: 914 if (ABCIsConst) 915 Res = Nand(A, Or(B, Not(C))); 916 break; 917 case 0x30: 918 if (ABIsConst) 919 Res = Nor(B, Not(A)); 920 break; 921 case 0x31: 922 if (ABCIsConst) 923 Res = Nor(Nor(A, Not(C)), B); 924 break; 925 case 0x32: 926 if (ABCIsConst) 927 Res = Nor(Nor(A, C), B); 928 break; 929 case 0x33: 930 Res = Not(B); 931 break; 932 case 0x34: 933 if (ABCIsConst) 934 Res = And(Xor(A, B), Nand(B, C)); 935 break; 936 case 0x35: 937 if (ABCIsConst) 938 Res = Xor(B, Or(A, Xnor(B, C))); 939 break; 940 case 0x36: 941 if (ABCIsConst) 942 Res = Xor(Or(A, C), B); 943 break; 944 case 0x37: 945 if (ABCIsConst) 946 Res = Nand(Or(A, C), B); 947 break; 948 case 0x38: 949 if (ABCIsConst) 950 Res = Nor(Xnor(A, B), Nor(A, C)); 951 break; 952 case 0x39: 953 if (ABCIsConst) 954 Res = Xor(Or(A, Not(C)), B); 955 break; 956 case 0x3a: 957 if (ABCIsConst) 958 Res = Xor(B, Or(A, Xor(B, C))); 959 break; 960 case 0x3b: 961 if (ABCIsConst) 962 Res = Nand(Or(A, Not(C)), B); 963 break; 964 case 0x3c: 965 Res = Xor(A, B); 966 break; 967 case 0x3d: 968 if (ABCIsConst) 969 Res = Xor(A, Or(Nor(A, C), B)); 970 break; 971 case 0x3e: 972 if (ABCIsConst) 973 Res = Xor(A, Or(Nor(A, Not(C)), B)); 974 break; 975 case 0x3f: 976 if (ABIsConst) 977 Res = Nand(A, B); 978 break; 979 case 0x40: 980 if (ABCIsConst) 981 Res = Nor(Nand(A, B), C); 982 break; 983 case 0x41: 984 if (ABCIsConst) 985 Res = Nor(Xor(A, B), C); 986 break; 987 case 0x42: 988 if (ABCIsConst) 989 Res = Nor(Xor(A, B), Xnor(A, C)); 990 break; 991 case 0x43: 992 if (ABCIsConst) 993 Res = Xor(A, Nand(Nand(A, C), B)); 994 break; 995 case 0x44: 996 if (BCIsConst) 997 Res = Nor(C, Not(B)); 998 break; 999 case 0x45: 1000 if (ABCIsConst) 1001 Res = Nor(Nor(B, Not(A)), C); 1002 break; 1003 case 0x46: 1004 if (ABCIsConst) 1005 Res = Xor(Or(And(A, C), B), C); 1006 break; 1007 case 0x47: 1008 if (ABCIsConst) 1009 Res = Xor(Or(Xnor(A, C), B), C); 1010 break; 1011 case 0x48: 1012 if (ABCIsConst) 1013 Res = And(Xor(A, C), B); 1014 break; 1015 case 0x49: 1016 if (ABCIsConst) 1017 Res = Xor(Or(Xnor(A, B), And(A, C)), C); 1018 break; 1019 case 0x4a: 1020 if (ABCIsConst) 1021 Res = Nor(Xnor(A, C), Nor(B, C)); 1022 break; 1023 case 0x4b: 1024 if (ABCIsConst) 1025 Res = Xor(A, Or(C, Not(B))); 1026 break; 1027 case 0x4c: 1028 if (ABCIsConst) 1029 Res = And(Nand(A, C), B); 1030 break; 1031 case 0x4d: 1032 if (ABCIsConst) 1033 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A); 1034 break; 1035 case 0x4e: 1036 if (ABCIsConst) 1037 Res = Xor(A, Or(Xor(A, B), C)); 1038 break; 1039 case 0x4f: 1040 if (ABCIsConst) 1041 Res = Nand(A, Nand(B, Not(C))); 1042 break; 1043 case 0x50: 1044 if (ACIsConst) 1045 Res = Nor(C, Not(A)); 1046 break; 1047 case 0x51: 1048 if (ABCIsConst) 1049 Res = Nor(Nor(A, Not(B)), C); 1050 break; 1051 case 0x52: 1052 if (ABCIsConst) 1053 Res = And(Xor(A, C), Nand(B, C)); 1054 break; 1055 case 0x53: 1056 if (ABCIsConst) 1057 Res = Xor(Or(Xnor(B, C), A), C); 1058 break; 1059 case 0x54: 1060 if (ABCIsConst) 1061 Res = Nor(Nor(A, B), C); 1062 break; 1063 case 0x55: 1064 Res = Not(C); 1065 break; 1066 case 0x56: 1067 if (ABCIsConst) 1068 Res = Xor(Or(A, B), C); 1069 break; 1070 case 0x57: 1071 if (ABCIsConst) 1072 Res = Nand(Or(A, B), C); 1073 break; 1074 case 0x58: 1075 if (ABCIsConst) 1076 Res = Nor(Nor(A, B), Xnor(A, C)); 1077 break; 1078 case 0x59: 1079 if (ABCIsConst) 1080 Res = Xor(Or(A, Not(B)), C); 1081 break; 1082 
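  // Like 0x3c (A ^ B) above, cases that collapse to a single two-operand op,
  // such as 0x5a == A ^ C below, are emitted even for non-constant operands.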
case 0x5a: 1083 Res = Xor(A, C); 1084 break; 1085 case 0x5b: 1086 if (ABCIsConst) 1087 Res = Xor(A, Or(Nor(A, B), C)); 1088 break; 1089 case 0x5c: 1090 if (ABCIsConst) 1091 Res = Xor(Or(Xor(B, C), A), C); 1092 break; 1093 case 0x5d: 1094 if (ABCIsConst) 1095 Res = Nand(Or(A, Not(B)), C); 1096 break; 1097 case 0x5e: 1098 if (ABCIsConst) 1099 Res = Xor(A, Or(Nor(A, Not(B)), C)); 1100 break; 1101 case 0x5f: 1102 if (ACIsConst) 1103 Res = Nand(A, C); 1104 break; 1105 case 0x60: 1106 if (ABCIsConst) 1107 Res = And(A, Xor(B, C)); 1108 break; 1109 case 0x61: 1110 if (ABCIsConst) 1111 Res = Xor(Or(Xnor(A, B), And(B, C)), C); 1112 break; 1113 case 0x62: 1114 if (ABCIsConst) 1115 Res = Nor(Nor(A, C), Xnor(B, C)); 1116 break; 1117 case 0x63: 1118 if (ABCIsConst) 1119 Res = Xor(B, Or(C, Not(A))); 1120 break; 1121 case 0x64: 1122 if (ABCIsConst) 1123 Res = Nor(Nor(A, B), Xnor(B, C)); 1124 break; 1125 case 0x65: 1126 if (ABCIsConst) 1127 Res = Xor(Or(B, Not(A)), C); 1128 break; 1129 case 0x66: 1130 Res = Xor(B, C); 1131 break; 1132 case 0x67: 1133 if (ABCIsConst) 1134 Res = Or(Nor(A, B), Xor(B, C)); 1135 break; 1136 case 0x68: 1137 if (ABCIsConst) 1138 Res = Xor(Xor(A, B), Nor(Nor(A, B), C)); 1139 break; 1140 case 0x69: 1141 if (ABCIsConst) 1142 Res = Xor(Xnor(A, B), C); 1143 break; 1144 case 0x6a: 1145 if (ABCIsConst) 1146 Res = Xor(And(A, B), C); 1147 break; 1148 case 0x6b: 1149 if (ABCIsConst) 1150 Res = Or(Nor(A, B), Xor(Xnor(A, B), C)); 1151 break; 1152 case 0x6c: 1153 if (ABCIsConst) 1154 Res = Xor(And(A, C), B); 1155 break; 1156 case 0x6d: 1157 if (ABCIsConst) 1158 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C); 1159 break; 1160 case 0x6e: 1161 if (ABCIsConst) 1162 Res = Or(Nor(A, Not(B)), Xor(B, C)); 1163 break; 1164 case 0x6f: 1165 if (ABCIsConst) 1166 Res = Nand(A, Xnor(B, C)); 1167 break; 1168 case 0x70: 1169 if (ABCIsConst) 1170 Res = And(A, Nand(B, C)); 1171 break; 1172 case 0x71: 1173 if (ABCIsConst) 1174 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A); 1175 break; 1176 case 0x72: 1177 if (ABCIsConst) 1178 Res = Xor(Or(Xor(A, B), C), B); 1179 break; 1180 case 0x73: 1181 if (ABCIsConst) 1182 Res = Nand(Nand(A, Not(C)), B); 1183 break; 1184 case 0x74: 1185 if (ABCIsConst) 1186 Res = Xor(Or(Xor(A, C), B), C); 1187 break; 1188 case 0x75: 1189 if (ABCIsConst) 1190 Res = Nand(Nand(A, Not(B)), C); 1191 break; 1192 case 0x76: 1193 if (ABCIsConst) 1194 Res = Xor(B, Or(Nor(B, Not(A)), C)); 1195 break; 1196 case 0x77: 1197 if (BCIsConst) 1198 Res = Nand(B, C); 1199 break; 1200 case 0x78: 1201 if (ABCIsConst) 1202 Res = Xor(A, And(B, C)); 1203 break; 1204 case 0x79: 1205 if (ABCIsConst) 1206 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C); 1207 break; 1208 case 0x7a: 1209 if (ABCIsConst) 1210 Res = Or(Xor(A, C), Nor(B, Not(A))); 1211 break; 1212 case 0x7b: 1213 if (ABCIsConst) 1214 Res = Nand(Xnor(A, C), B); 1215 break; 1216 case 0x7c: 1217 if (ABCIsConst) 1218 Res = Or(Xor(A, B), Nor(C, Not(A))); 1219 break; 1220 case 0x7d: 1221 if (ABCIsConst) 1222 Res = Nand(Xnor(A, B), C); 1223 break; 1224 case 0x7e: 1225 if (ABCIsConst) 1226 Res = Or(Xor(A, B), Xor(A, C)); 1227 break; 1228 case 0x7f: 1229 if (ABCIsConst) 1230 Res = Nand(And(A, B), C); 1231 break; 1232 case 0x80: 1233 if (ABCIsConst) 1234 Res = And(And(A, B), C); 1235 break; 1236 case 0x81: 1237 if (ABCIsConst) 1238 Res = Nor(Xor(A, B), Xor(A, C)); 1239 break; 1240 case 0x82: 1241 if (ABCIsConst) 1242 Res = And(Xnor(A, B), C); 1243 break; 1244 case 0x83: 1245 if (ABCIsConst) 1246 Res = Nor(Xor(A, B), Nor(C, Not(A))); 1247 break; 1248 case 0x84: 1249 if (ABCIsConst) 1250 
Res = And(Xnor(A, C), B); 1251 break; 1252 case 0x85: 1253 if (ABCIsConst) 1254 Res = Nor(Xor(A, C), Nor(B, Not(A))); 1255 break; 1256 case 0x86: 1257 if (ABCIsConst) 1258 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C); 1259 break; 1260 case 0x87: 1261 if (ABCIsConst) 1262 Res = Xor(A, Nand(B, C)); 1263 break; 1264 case 0x88: 1265 Res = And(B, C); 1266 break; 1267 case 0x89: 1268 if (ABCIsConst) 1269 Res = Xor(B, Nor(Nor(B, Not(A)), C)); 1270 break; 1271 case 0x8a: 1272 if (ABCIsConst) 1273 Res = And(Nand(A, Not(B)), C); 1274 break; 1275 case 0x8b: 1276 if (ABCIsConst) 1277 Res = Xor(Nor(Xor(A, C), B), C); 1278 break; 1279 case 0x8c: 1280 if (ABCIsConst) 1281 Res = And(Nand(A, Not(C)), B); 1282 break; 1283 case 0x8d: 1284 if (ABCIsConst) 1285 Res = Xor(Nor(Xor(A, B), C), B); 1286 break; 1287 case 0x8e: 1288 if (ABCIsConst) 1289 Res = Xor(Or(Xor(A, B), Xor(A, C)), A); 1290 break; 1291 case 0x8f: 1292 if (ABCIsConst) 1293 Res = Nand(A, Nand(B, C)); 1294 break; 1295 case 0x90: 1296 if (ABCIsConst) 1297 Res = And(A, Xnor(B, C)); 1298 break; 1299 case 0x91: 1300 if (ABCIsConst) 1301 Res = Nor(Nor(A, Not(B)), Xor(B, C)); 1302 break; 1303 case 0x92: 1304 if (ABCIsConst) 1305 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C); 1306 break; 1307 case 0x93: 1308 if (ABCIsConst) 1309 Res = Xor(Nand(A, C), B); 1310 break; 1311 case 0x94: 1312 if (ABCIsConst) 1313 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C)); 1314 break; 1315 case 0x95: 1316 if (ABCIsConst) 1317 Res = Xor(Nand(A, B), C); 1318 break; 1319 case 0x96: 1320 if (ABCIsConst) 1321 Res = Xor(Xor(A, B), C); 1322 break; 1323 case 0x97: 1324 if (ABCIsConst) 1325 Res = Xor(Xor(A, B), Or(Nor(A, B), C)); 1326 break; 1327 case 0x98: 1328 if (ABCIsConst) 1329 Res = Nor(Nor(A, B), Xor(B, C)); 1330 break; 1331 case 0x99: 1332 if (BCIsConst) 1333 Res = Xnor(B, C); 1334 break; 1335 case 0x9a: 1336 if (ABCIsConst) 1337 Res = Xor(Nor(B, Not(A)), C); 1338 break; 1339 case 0x9b: 1340 if (ABCIsConst) 1341 Res = Or(Nor(A, B), Xnor(B, C)); 1342 break; 1343 case 0x9c: 1344 if (ABCIsConst) 1345 Res = Xor(B, Nor(C, Not(A))); 1346 break; 1347 case 0x9d: 1348 if (ABCIsConst) 1349 Res = Or(Nor(A, C), Xnor(B, C)); 1350 break; 1351 case 0x9e: 1352 if (ABCIsConst) 1353 Res = Xor(And(Xor(A, B), Nand(B, C)), C); 1354 break; 1355 case 0x9f: 1356 if (ABCIsConst) 1357 Res = Nand(A, Xor(B, C)); 1358 break; 1359 case 0xa0: 1360 Res = And(A, C); 1361 break; 1362 case 0xa1: 1363 if (ABCIsConst) 1364 Res = Xor(A, Nor(Nor(A, Not(B)), C)); 1365 break; 1366 case 0xa2: 1367 if (ABCIsConst) 1368 Res = And(Or(A, Not(B)), C); 1369 break; 1370 case 0xa3: 1371 if (ABCIsConst) 1372 Res = Xor(Nor(Xor(B, C), A), C); 1373 break; 1374 case 0xa4: 1375 if (ABCIsConst) 1376 Res = Xor(A, Nor(Nor(A, B), C)); 1377 break; 1378 case 0xa5: 1379 if (ACIsConst) 1380 Res = Xnor(A, C); 1381 break; 1382 case 0xa6: 1383 if (ABCIsConst) 1384 Res = Xor(Nor(A, Not(B)), C); 1385 break; 1386 case 0xa7: 1387 if (ABCIsConst) 1388 Res = Or(Nor(A, B), Xnor(A, C)); 1389 break; 1390 case 0xa8: 1391 if (ABCIsConst) 1392 Res = And(Or(A, B), C); 1393 break; 1394 case 0xa9: 1395 if (ABCIsConst) 1396 Res = Xor(Nor(A, B), C); 1397 break; 1398 case 0xaa: 1399 Res = C; 1400 break; 1401 case 0xab: 1402 if (ABCIsConst) 1403 Res = Or(Nor(A, B), C); 1404 break; 1405 case 0xac: 1406 if (ABCIsConst) 1407 Res = Xor(Nor(Xnor(B, C), A), C); 1408 break; 1409 case 0xad: 1410 if (ABCIsConst) 1411 Res = Or(Xnor(A, C), And(B, C)); 1412 break; 1413 case 0xae: 1414 if (ABCIsConst) 1415 Res = Or(Nor(A, Not(B)), C); 1416 break; 1417 case 0xaf: 1418 if (ACIsConst) 
1419 Res = Or(C, Not(A)); 1420 break; 1421 case 0xb0: 1422 if (ABCIsConst) 1423 Res = And(A, Nand(B, Not(C))); 1424 break; 1425 case 0xb1: 1426 if (ABCIsConst) 1427 Res = Xor(A, Nor(Xor(A, B), C)); 1428 break; 1429 case 0xb2: 1430 if (ABCIsConst) 1431 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A); 1432 break; 1433 case 0xb3: 1434 if (ABCIsConst) 1435 Res = Nand(Nand(A, C), B); 1436 break; 1437 case 0xb4: 1438 if (ABCIsConst) 1439 Res = Xor(A, Nor(C, Not(B))); 1440 break; 1441 case 0xb5: 1442 if (ABCIsConst) 1443 Res = Or(Xnor(A, C), Nor(B, C)); 1444 break; 1445 case 0xb6: 1446 if (ABCIsConst) 1447 Res = Xor(And(Xor(A, B), Nand(A, C)), C); 1448 break; 1449 case 0xb7: 1450 if (ABCIsConst) 1451 Res = Nand(Xor(A, C), B); 1452 break; 1453 case 0xb8: 1454 if (ABCIsConst) 1455 Res = Xor(Nor(Xnor(A, C), B), C); 1456 break; 1457 case 0xb9: 1458 if (ABCIsConst) 1459 Res = Xor(Nor(And(A, C), B), C); 1460 break; 1461 case 0xba: 1462 if (ABCIsConst) 1463 Res = Or(Nor(B, Not(A)), C); 1464 break; 1465 case 0xbb: 1466 if (BCIsConst) 1467 Res = Or(C, Not(B)); 1468 break; 1469 case 0xbc: 1470 if (ABCIsConst) 1471 Res = Xor(A, And(Nand(A, C), B)); 1472 break; 1473 case 0xbd: 1474 if (ABCIsConst) 1475 Res = Or(Xor(A, B), Xnor(A, C)); 1476 break; 1477 case 0xbe: 1478 if (ABCIsConst) 1479 Res = Or(Xor(A, B), C); 1480 break; 1481 case 0xbf: 1482 if (ABCIsConst) 1483 Res = Or(Nand(A, B), C); 1484 break; 1485 case 0xc0: 1486 Res = And(A, B); 1487 break; 1488 case 0xc1: 1489 if (ABCIsConst) 1490 Res = Xor(A, Nor(Nor(A, Not(C)), B)); 1491 break; 1492 case 0xc2: 1493 if (ABCIsConst) 1494 Res = Xor(A, Nor(Nor(A, C), B)); 1495 break; 1496 case 0xc3: 1497 if (ABIsConst) 1498 Res = Xnor(A, B); 1499 break; 1500 case 0xc4: 1501 if (ABCIsConst) 1502 Res = And(Or(A, Not(C)), B); 1503 break; 1504 case 0xc5: 1505 if (ABCIsConst) 1506 Res = Xor(B, Nor(A, Xor(B, C))); 1507 break; 1508 case 0xc6: 1509 if (ABCIsConst) 1510 Res = Xor(Nor(A, Not(C)), B); 1511 break; 1512 case 0xc7: 1513 if (ABCIsConst) 1514 Res = Or(Xnor(A, B), Nor(A, C)); 1515 break; 1516 case 0xc8: 1517 if (ABCIsConst) 1518 Res = And(Or(A, C), B); 1519 break; 1520 case 0xc9: 1521 if (ABCIsConst) 1522 Res = Xor(Nor(A, C), B); 1523 break; 1524 case 0xca: 1525 if (ABCIsConst) 1526 Res = Xor(B, Nor(A, Xnor(B, C))); 1527 break; 1528 case 0xcb: 1529 if (ABCIsConst) 1530 Res = Or(Xnor(A, B), And(B, C)); 1531 break; 1532 case 0xcc: 1533 Res = B; 1534 break; 1535 case 0xcd: 1536 if (ABCIsConst) 1537 Res = Or(Nor(A, C), B); 1538 break; 1539 case 0xce: 1540 if (ABCIsConst) 1541 Res = Or(Nor(A, Not(C)), B); 1542 break; 1543 case 0xcf: 1544 if (ABIsConst) 1545 Res = Or(B, Not(A)); 1546 break; 1547 case 0xd0: 1548 if (ABCIsConst) 1549 Res = And(A, Or(B, Not(C))); 1550 break; 1551 case 0xd1: 1552 if (ABCIsConst) 1553 Res = Xor(A, Nor(Xor(A, C), B)); 1554 break; 1555 case 0xd2: 1556 if (ABCIsConst) 1557 Res = Xor(A, Nor(B, Not(C))); 1558 break; 1559 case 0xd3: 1560 if (ABCIsConst) 1561 Res = Or(Xnor(A, B), Nor(B, C)); 1562 break; 1563 case 0xd4: 1564 if (ABCIsConst) 1565 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A); 1566 break; 1567 case 0xd5: 1568 if (ABCIsConst) 1569 Res = Nand(Nand(A, B), C); 1570 break; 1571 case 0xd6: 1572 if (ABCIsConst) 1573 Res = Xor(Xor(A, B), Or(And(A, B), C)); 1574 break; 1575 case 0xd7: 1576 if (ABCIsConst) 1577 Res = Nand(Xor(A, B), C); 1578 break; 1579 case 0xd8: 1580 if (ABCIsConst) 1581 Res = Xor(Nor(Xnor(A, B), C), B); 1582 break; 1583 case 0xd9: 1584 if (ABCIsConst) 1585 Res = Or(And(A, B), Xnor(B, C)); 1586 break; 1587 case 0xda: 1588 if (ABCIsConst) 
1589 Res = Xor(A, And(Nand(A, B), C)); 1590 break; 1591 case 0xdb: 1592 if (ABCIsConst) 1593 Res = Or(Xnor(A, B), Xor(A, C)); 1594 break; 1595 case 0xdc: 1596 if (ABCIsConst) 1597 Res = Or(B, Nor(C, Not(A))); 1598 break; 1599 case 0xdd: 1600 if (BCIsConst) 1601 Res = Or(B, Not(C)); 1602 break; 1603 case 0xde: 1604 if (ABCIsConst) 1605 Res = Or(Xor(A, C), B); 1606 break; 1607 case 0xdf: 1608 if (ABCIsConst) 1609 Res = Or(Nand(A, C), B); 1610 break; 1611 case 0xe0: 1612 if (ABCIsConst) 1613 Res = And(A, Or(B, C)); 1614 break; 1615 case 0xe1: 1616 if (ABCIsConst) 1617 Res = Xor(A, Nor(B, C)); 1618 break; 1619 case 0xe2: 1620 if (ABCIsConst) 1621 Res = Xor(A, Nor(Xnor(A, C), B)); 1622 break; 1623 case 0xe3: 1624 if (ABCIsConst) 1625 Res = Xor(A, Nor(And(A, C), B)); 1626 break; 1627 case 0xe4: 1628 if (ABCIsConst) 1629 Res = Xor(A, Nor(Xnor(A, B), C)); 1630 break; 1631 case 0xe5: 1632 if (ABCIsConst) 1633 Res = Xor(A, Nor(And(A, B), C)); 1634 break; 1635 case 0xe6: 1636 if (ABCIsConst) 1637 Res = Or(And(A, B), Xor(B, C)); 1638 break; 1639 case 0xe7: 1640 if (ABCIsConst) 1641 Res = Or(Xnor(A, B), Xnor(A, C)); 1642 break; 1643 case 0xe8: 1644 if (ABCIsConst) 1645 Res = Xor(Or(A, B), Nor(Xnor(A, B), C)); 1646 break; 1647 case 0xe9: 1648 if (ABCIsConst) 1649 Res = Xor(Xor(A, B), Nand(Nand(A, B), C)); 1650 break; 1651 case 0xea: 1652 if (ABCIsConst) 1653 Res = Or(And(A, B), C); 1654 break; 1655 case 0xeb: 1656 if (ABCIsConst) 1657 Res = Or(Xnor(A, B), C); 1658 break; 1659 case 0xec: 1660 if (ABCIsConst) 1661 Res = Or(And(A, C), B); 1662 break; 1663 case 0xed: 1664 if (ABCIsConst) 1665 Res = Or(Xnor(A, C), B); 1666 break; 1667 case 0xee: 1668 Res = Or(B, C); 1669 break; 1670 case 0xef: 1671 if (ABCIsConst) 1672 Res = Nand(A, Nor(B, C)); 1673 break; 1674 case 0xf0: 1675 Res = A; 1676 break; 1677 case 0xf1: 1678 if (ABCIsConst) 1679 Res = Or(A, Nor(B, C)); 1680 break; 1681 case 0xf2: 1682 if (ABCIsConst) 1683 Res = Or(A, Nor(B, Not(C))); 1684 break; 1685 case 0xf3: 1686 if (ABIsConst) 1687 Res = Or(A, Not(B)); 1688 break; 1689 case 0xf4: 1690 if (ABCIsConst) 1691 Res = Or(A, Nor(C, Not(B))); 1692 break; 1693 case 0xf5: 1694 if (ACIsConst) 1695 Res = Or(A, Not(C)); 1696 break; 1697 case 0xf6: 1698 if (ABCIsConst) 1699 Res = Or(A, Xor(B, C)); 1700 break; 1701 case 0xf7: 1702 if (ABCIsConst) 1703 Res = Or(A, Nand(B, C)); 1704 break; 1705 case 0xf8: 1706 if (ABCIsConst) 1707 Res = Or(A, And(B, C)); 1708 break; 1709 case 0xf9: 1710 if (ABCIsConst) 1711 Res = Or(A, Xnor(B, C)); 1712 break; 1713 case 0xfa: 1714 Res = Or(A, C); 1715 break; 1716 case 0xfb: 1717 if (ABCIsConst) 1718 Res = Nand(Nor(A, C), B); 1719 break; 1720 case 0xfc: 1721 Res = Or(A, B); 1722 break; 1723 case 0xfd: 1724 if (ABCIsConst) 1725 Res = Nand(Nor(A, B), C); 1726 break; 1727 case 0xfe: 1728 if (ABCIsConst) 1729 Res = Or(Or(A, B), C); 1730 break; 1731 case 0xff: 1732 Res = {Constant::getAllOnesValue(Ty), 0xff}; 1733 break; 1734 } 1735 1736 assert((Res.first == nullptr || Res.second == Imm) && 1737 "Simplification of ternary logic does not verify!"); 1738 return Res.first; 1739 } 1740 1741 static Value *simplifyX86insertps(const IntrinsicInst &II, 1742 InstCombiner::BuilderTy &Builder) { 1743 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1744 if (!CInt) 1745 return nullptr; 1746 1747 auto *VecTy = cast<FixedVectorType>(II.getType()); 1748 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 1749 1750 // The immediate permute control byte looks like this: 1751 // [3:0] - zero mask for each 32-bit lane 1752 
// [5:4] - select one 32-bit destination lane 1753 // [7:6] - select one 32-bit source lane 1754 1755 uint8_t Imm = CInt->getZExtValue(); 1756 uint8_t ZMask = Imm & 0xf; 1757 uint8_t DestLane = (Imm >> 4) & 0x3; 1758 uint8_t SourceLane = (Imm >> 6) & 0x3; 1759 1760 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 1761 1762 // If all zero mask bits are set, this was just a weird way to 1763 // generate a zero vector. 1764 if (ZMask == 0xf) 1765 return ZeroVector; 1766 1767 // Initialize by passing all of the first source bits through. 1768 int ShuffleMask[4] = {0, 1, 2, 3}; 1769 1770 // We may replace the second operand with the zero vector. 1771 Value *V1 = II.getArgOperand(1); 1772 1773 if (ZMask) { 1774 // If the zero mask is being used with a single input or the zero mask 1775 // overrides the destination lane, this is a shuffle with the zero vector. 1776 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 1777 (ZMask & (1 << DestLane))) { 1778 V1 = ZeroVector; 1779 // We may still move 32-bits of the first source vector from one lane 1780 // to another. 1781 ShuffleMask[DestLane] = SourceLane; 1782 // The zero mask may override the previous insert operation. 1783 for (unsigned i = 0; i < 4; ++i) 1784 if ((ZMask >> i) & 0x1) 1785 ShuffleMask[i] = i + 4; 1786 } else { 1787 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 1788 return nullptr; 1789 } 1790 } else { 1791 // Replace the selected destination lane with the selected source lane. 1792 ShuffleMask[DestLane] = SourceLane + 4; 1793 } 1794 1795 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 1796 } 1797 1798 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 1799 /// or conversion to a shuffle vector. 1800 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 1801 ConstantInt *CILength, ConstantInt *CIIndex, 1802 InstCombiner::BuilderTy &Builder) { 1803 auto LowConstantHighUndef = [&](uint64_t Val) { 1804 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1805 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 1806 UndefValue::get(IntTy64)}; 1807 return ConstantVector::get(Args); 1808 }; 1809 1810 // See if we're dealing with constant values. 1811 auto *C0 = dyn_cast<Constant>(Op0); 1812 auto *CI0 = 1813 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1814 : nullptr; 1815 1816 // Attempt to constant fold. 1817 if (CILength && CIIndex) { 1818 // From AMD documentation: "The bit index and field length are each six 1819 // bits in length other bits of the field are ignored." 1820 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 1821 APInt APLength = CILength->getValue().zextOrTrunc(6); 1822 1823 unsigned Index = APIndex.getZExtValue(); 1824 1825 // From AMD documentation: "a value of zero in the field length is 1826 // defined as length of 64". 1827 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1828 1829 // From AMD documentation: "If the sum of the bit index + length field 1830 // is greater than 64, the results are undefined". 1831 unsigned End = Index + Length; 1832 1833 // Note that both field index and field length are 8-bit quantities. 1834 // Since variables 'Index' and 'Length' are unsigned values 1835 // obtained from zero-extending field index and field length 1836 // respectively, their sum should never wrap around. 1837 if (End > 64) 1838 return UndefValue::get(II.getType()); 1839 1840 // If we are inserting whole bytes, we can convert this to a shuffle. 
1841 // Lowering can recognize EXTRQI shuffle masks. 1842 if ((Length % 8) == 0 && (Index % 8) == 0) { 1843 // Convert bit indices to byte indices. 1844 Length /= 8; 1845 Index /= 8; 1846 1847 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1848 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1849 1850 SmallVector<int, 16> ShuffleMask; 1851 for (int i = 0; i != (int)Length; ++i) 1852 ShuffleMask.push_back(i + Index); 1853 for (int i = Length; i != 8; ++i) 1854 ShuffleMask.push_back(i + 16); 1855 for (int i = 8; i != 16; ++i) 1856 ShuffleMask.push_back(-1); 1857 1858 Value *SV = Builder.CreateShuffleVector( 1859 Builder.CreateBitCast(Op0, ShufTy), 1860 ConstantAggregateZero::get(ShufTy), ShuffleMask); 1861 return Builder.CreateBitCast(SV, II.getType()); 1862 } 1863 1864 // Constant Fold - shift Index'th bit to lowest position and mask off 1865 // Length bits. 1866 if (CI0) { 1867 APInt Elt = CI0->getValue(); 1868 Elt.lshrInPlace(Index); 1869 Elt = Elt.zextOrTrunc(Length); 1870 return LowConstantHighUndef(Elt.getZExtValue()); 1871 } 1872 1873 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 1874 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 1875 Value *Args[] = {Op0, CILength, CIIndex}; 1876 Module *M = II.getModule(); 1877 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 1878 return Builder.CreateCall(F, Args); 1879 } 1880 } 1881 1882 // Constant Fold - extraction from zero is always {zero, undef}. 1883 if (CI0 && CI0->isZero()) 1884 return LowConstantHighUndef(0); 1885 1886 return nullptr; 1887 } 1888 1889 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 1890 /// folding or conversion to a shuffle vector. 1891 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 1892 APInt APLength, APInt APIndex, 1893 InstCombiner::BuilderTy &Builder) { 1894 // From AMD documentation: "The bit index and field length are each six bits 1895 // in length other bits of the field are ignored." 1896 APIndex = APIndex.zextOrTrunc(6); 1897 APLength = APLength.zextOrTrunc(6); 1898 1899 // Attempt to constant fold. 1900 unsigned Index = APIndex.getZExtValue(); 1901 1902 // From AMD documentation: "a value of zero in the field length is 1903 // defined as length of 64". 1904 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1905 1906 // From AMD documentation: "If the sum of the bit index + length field 1907 // is greater than 64, the results are undefined". 1908 unsigned End = Index + Length; 1909 1910 // Note that both field index and field length are 8-bit quantities. 1911 // Since variables 'Index' and 'Length' are unsigned values 1912 // obtained from zero-extending field index and field length 1913 // respectively, their sum should never wrap around. 1914 if (End > 64) 1915 return UndefValue::get(II.getType()); 1916 1917 // If we are inserting whole bytes, we can convert this to a shuffle. 1918 // Lowering can recognize INSERTQI shuffle masks. 1919 if ((Length % 8) == 0 && (Index % 8) == 0) { 1920 // Convert bit indices to byte indices. 
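    // e.g. an INSERTQI of Length 32 bits at Index 16 copies the low 4 bytes
    // of Op1 into bytes 2..5 of the low 8 bytes of Op0.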
1921 Length /= 8; 1922 Index /= 8; 1923 1924 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1925 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1926 1927 SmallVector<int, 16> ShuffleMask; 1928 for (int i = 0; i != (int)Index; ++i) 1929 ShuffleMask.push_back(i); 1930 for (int i = 0; i != (int)Length; ++i) 1931 ShuffleMask.push_back(i + 16); 1932 for (int i = Index + Length; i != 8; ++i) 1933 ShuffleMask.push_back(i); 1934 for (int i = 8; i != 16; ++i) 1935 ShuffleMask.push_back(-1); 1936 1937 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 1938 Builder.CreateBitCast(Op1, ShufTy), 1939 ShuffleMask); 1940 return Builder.CreateBitCast(SV, II.getType()); 1941 } 1942 1943 // See if we're dealing with constant values. 1944 auto *C0 = dyn_cast<Constant>(Op0); 1945 auto *C1 = dyn_cast<Constant>(Op1); 1946 auto *CI00 = 1947 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1948 : nullptr; 1949 auto *CI10 = 1950 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 1951 : nullptr; 1952 1953 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 1954 if (CI00 && CI10) { 1955 APInt V00 = CI00->getValue(); 1956 APInt V10 = CI10->getValue(); 1957 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 1958 V00 = V00 & ~Mask; 1959 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 1960 APInt Val = V00 | V10; 1961 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1962 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 1963 UndefValue::get(IntTy64)}; 1964 return ConstantVector::get(Args); 1965 } 1966 1967 // If we were an INSERTQ call, we'll save demanded elements if we convert to 1968 // INSERTQI. 1969 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 1970 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1971 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 1972 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 1973 1974 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 1975 Module *M = II.getModule(); 1976 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 1977 return Builder.CreateCall(F, Args); 1978 } 1979 1980 return nullptr; 1981 } 1982 1983 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 1984 static Value *simplifyX86pshufb(const IntrinsicInst &II, 1985 InstCombiner::BuilderTy &Builder) { 1986 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1987 if (!V) 1988 return nullptr; 1989 1990 auto *VecTy = cast<FixedVectorType>(II.getType()); 1991 unsigned NumElts = VecTy->getNumElements(); 1992 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 1993 "Unexpected number of elements in shuffle mask!"); 1994 1995 // Construct a shuffle mask from constant integers or UNDEFs. 1996 int Indexes[64]; 1997 1998 // Each byte in the shuffle control mask forms an index to permute the 1999 // corresponding byte in the destination operand. 2000 for (unsigned I = 0; I < NumElts; ++I) { 2001 Constant *COp = V->getAggregateElement(I); 2002 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 2003 return nullptr; 2004 2005 if (isa<UndefValue>(COp)) { 2006 Indexes[I] = -1; 2007 continue; 2008 } 2009 2010 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 2011 2012 // If the most significant bit (bit[7]) of each byte of the shuffle 2013 // control mask is set, then zero is written in the result byte. 2014 // The zero vector is in the right-hand side of the resulting 2015 // shufflevector. 
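    // e.g. any control byte with bit[7] set (0x80..0xff) selects the zero
    // vector, while 0x13 selects byte 3 of the source within the current
    // 16-byte lane.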
2016 2017 // The value of each index for the high 128-bit lane is the least 2018 // significant 4 bits of the respective shuffle control byte. 2019 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); 2020 Indexes[I] = Index; 2021 } 2022 2023 auto V1 = II.getArgOperand(0); 2024 auto V2 = Constant::getNullValue(VecTy); 2025 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts)); 2026 } 2027 2028 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 2029 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 2030 InstCombiner::BuilderTy &Builder) { 2031 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 2032 if (!V) 2033 return nullptr; 2034 2035 auto *VecTy = cast<FixedVectorType>(II.getType()); 2036 unsigned NumElts = VecTy->getNumElements(); 2037 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 2038 unsigned NumLaneElts = IsPD ? 2 : 4; 2039 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 2040 2041 // Construct a shuffle mask from constant integers or UNDEFs. 2042 int Indexes[16]; 2043 2044 // The intrinsics only read one or two bits, clear the rest. 2045 for (unsigned I = 0; I < NumElts; ++I) { 2046 Constant *COp = V->getAggregateElement(I); 2047 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 2048 return nullptr; 2049 2050 if (isa<UndefValue>(COp)) { 2051 Indexes[I] = -1; 2052 continue; 2053 } 2054 2055 APInt Index = cast<ConstantInt>(COp)->getValue(); 2056 Index = Index.zextOrTrunc(32).getLoBits(2); 2057 2058 // The PD variants uses bit 1 to select per-lane element index, so 2059 // shift down to convert to generic shuffle mask index. 2060 if (IsPD) 2061 Index.lshrInPlace(1); 2062 2063 // The _256 variants are a bit trickier since the mask bits always index 2064 // into the corresponding 128 half. In order to convert to a generic 2065 // shuffle, we have to make that explicit. 2066 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 2067 2068 Indexes[I] = Index.getZExtValue(); 2069 } 2070 2071 auto V1 = II.getArgOperand(0); 2072 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts)); 2073 } 2074 2075 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 2076 static Value *simplifyX86vpermv(const IntrinsicInst &II, 2077 InstCombiner::BuilderTy &Builder) { 2078 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 2079 if (!V) 2080 return nullptr; 2081 2082 auto *VecTy = cast<FixedVectorType>(II.getType()); 2083 unsigned Size = VecTy->getNumElements(); 2084 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 2085 "Unexpected shuffle mask size"); 2086 2087 // Construct a shuffle mask from constant integers or UNDEFs. 2088 int Indexes[64]; 2089 2090 for (unsigned I = 0; I < Size; ++I) { 2091 Constant *COp = V->getAggregateElement(I); 2092 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 2093 return nullptr; 2094 2095 if (isa<UndefValue>(COp)) { 2096 Indexes[I] = -1; 2097 continue; 2098 } 2099 2100 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 2101 Index &= Size - 1; 2102 Indexes[I] = Index; 2103 } 2104 2105 auto V1 = II.getArgOperand(0); 2106 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size)); 2107 } 2108 2109 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant. 
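/// The index operand selects from the concatenation of the two data operands
/// (argument 0 and argument 2): after masking with (2 * Size - 1), an index
/// below Size reads from the first source and an index of Size or more reads
/// from the second. For example, with Size == 8 an index value of 11 selects
/// element 3 of the second source.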
2110 static Value *simplifyX86vpermv3(const IntrinsicInst &II, 2111 InstCombiner::BuilderTy &Builder) { 2112 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 2113 if (!V) 2114 return nullptr; 2115 2116 auto *VecTy = cast<FixedVectorType>(II.getType()); 2117 unsigned Size = VecTy->getNumElements(); 2118 assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 || 2119 Size == 64) && 2120 "Unexpected shuffle mask size"); 2121 2122 // Construct a shuffle mask from constant integers or UNDEFs. 2123 int Indexes[64]; 2124 2125 for (unsigned I = 0; I < Size; ++I) { 2126 Constant *COp = V->getAggregateElement(I); 2127 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 2128 return nullptr; 2129 2130 if (isa<UndefValue>(COp)) { 2131 Indexes[I] = -1; 2132 continue; 2133 } 2134 2135 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 2136 Index &= (2 * Size) - 1; 2137 Indexes[I] = Index; 2138 } 2139 2140 auto V1 = II.getArgOperand(0); 2141 auto V2 = II.getArgOperand(2); 2142 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size)); 2143 } 2144 2145 std::optional<Instruction *> 2146 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 2147 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 2148 unsigned DemandedWidth) { 2149 APInt UndefElts(Width, 0); 2150 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 2151 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 2152 }; 2153 2154 Intrinsic::ID IID = II.getIntrinsicID(); 2155 switch (IID) { 2156 case Intrinsic::x86_bmi_bextr_32: 2157 case Intrinsic::x86_bmi_bextr_64: 2158 case Intrinsic::x86_tbm_bextri_u32: 2159 case Intrinsic::x86_tbm_bextri_u64: 2160 // If the RHS is a constant we can try some simplifications. 2161 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2162 uint64_t Shift = C->getZExtValue(); 2163 uint64_t Length = (Shift >> 8) & 0xff; 2164 Shift &= 0xff; 2165 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2166 // If the length is 0 or the shift is out of range, replace with zero. 2167 if (Length == 0 || Shift >= BitWidth) { 2168 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2169 } 2170 // If the LHS is also a constant, we can completely constant fold this. 2171 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2172 uint64_t Result = InC->getZExtValue() >> Shift; 2173 if (Length > BitWidth) 2174 Length = BitWidth; 2175 Result &= maskTrailingOnes<uint64_t>(Length); 2176 return IC.replaceInstUsesWith(II, 2177 ConstantInt::get(II.getType(), Result)); 2178 } 2179 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2180 // are only masking bits that a shift already cleared? 2181 } 2182 break; 2183 2184 case Intrinsic::x86_bmi_bzhi_32: 2185 case Intrinsic::x86_bmi_bzhi_64: 2186 // If the RHS is a constant we can try some simplifications. 2187 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2188 uint64_t Index = C->getZExtValue() & 0xff; 2189 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2190 if (Index >= BitWidth) { 2191 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 2192 } 2193 if (Index == 0) { 2194 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2195 } 2196 // If the LHS is also a constant, we can completely constant fold this. 
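// e.g. (illustrative constants) bzhi(0x000000FF, 4) keeps only the low four
// bits and constant folds to 0x0000000F.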
2197 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2198 uint64_t Result = InC->getZExtValue();
2199 Result &= maskTrailingOnes<uint64_t>(Index);
2200 return IC.replaceInstUsesWith(II,
2201 ConstantInt::get(II.getType(), Result));
2202 }
2203 // TODO should we convert this to an AND if the RHS is constant?
2204 }
2205 break;
2206 case Intrinsic::x86_bmi_pext_32:
2207 case Intrinsic::x86_bmi_pext_64:
2208 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2209 if (MaskC->isNullValue()) {
2210 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2211 }
2212 if (MaskC->isAllOnesValue()) {
2213 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2214 }
2215
2216 unsigned MaskIdx, MaskLen;
2217 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2218 // Any single contiguous sequence of 1s anywhere in the mask simply
2219 // describes a subset of the input bits shifted to the appropriate
2220 // position. Replace with the straightforward IR.
2221 Value *Input = II.getArgOperand(0);
2222 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2223 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2224 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2225 return IC.replaceInstUsesWith(II, Shifted);
2226 }
2227
2228 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2229 uint64_t Src = SrcC->getZExtValue();
2230 uint64_t Mask = MaskC->getZExtValue();
2231 uint64_t Result = 0;
2232 uint64_t BitToSet = 1;
2233
2234 while (Mask) {
2235 // Isolate lowest set bit.
2236 uint64_t BitToTest = Mask & -Mask;
2237 if (BitToTest & Src)
2238 Result |= BitToSet;
2239
2240 BitToSet <<= 1;
2241 // Clear lowest set bit.
2242 Mask &= Mask - 1;
2243 }
2244
2245 return IC.replaceInstUsesWith(II,
2246 ConstantInt::get(II.getType(), Result));
2247 }
2248 }
2249 break;
2250 case Intrinsic::x86_bmi_pdep_32:
2251 case Intrinsic::x86_bmi_pdep_64:
2252 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2253 if (MaskC->isNullValue()) {
2254 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2255 }
2256 if (MaskC->isAllOnesValue()) {
2257 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2258 }
2259
2260 unsigned MaskIdx, MaskLen;
2261 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2262 // Any single contiguous sequence of 1s anywhere in the mask simply
2263 // describes a subset of the input bits shifted to the appropriate
2264 // position. Replace with the straightforward IR.
2265 Value *Input = II.getArgOperand(0);
2266 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2267 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2268 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2269 return IC.replaceInstUsesWith(II, Masked);
2270 }
2271
2272 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2273 uint64_t Src = SrcC->getZExtValue();
2274 uint64_t Mask = MaskC->getZExtValue();
2275 uint64_t Result = 0;
2276 uint64_t BitToTest = 1;
2277
2278 while (Mask) {
2279 // Isolate lowest set bit.
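// (Mask & -Mask) isolates the lowest set bit, e.g. 0b0110 & -0b0110 ==
// 0b0010, and (Mask & (Mask - 1)) then clears it, so the loop walks the mask
// bits from least to most significant. As an illustrative example,
// pdep(0b10, 0b1100) deposits source bit 1 into mask bit 3 and yields 0b1000.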
2280 uint64_t BitToSet = Mask & -Mask; 2281 if (BitToTest & Src) 2282 Result |= BitToSet; 2283 2284 BitToTest <<= 1; 2285 // Clear lowest set bit; 2286 Mask &= Mask - 1; 2287 } 2288 2289 return IC.replaceInstUsesWith(II, 2290 ConstantInt::get(II.getType(), Result)); 2291 } 2292 } 2293 break; 2294 2295 case Intrinsic::x86_sse_cvtss2si: 2296 case Intrinsic::x86_sse_cvtss2si64: 2297 case Intrinsic::x86_sse_cvttss2si: 2298 case Intrinsic::x86_sse_cvttss2si64: 2299 case Intrinsic::x86_sse2_cvtsd2si: 2300 case Intrinsic::x86_sse2_cvtsd2si64: 2301 case Intrinsic::x86_sse2_cvttsd2si: 2302 case Intrinsic::x86_sse2_cvttsd2si64: 2303 case Intrinsic::x86_avx512_vcvtss2si32: 2304 case Intrinsic::x86_avx512_vcvtss2si64: 2305 case Intrinsic::x86_avx512_vcvtss2usi32: 2306 case Intrinsic::x86_avx512_vcvtss2usi64: 2307 case Intrinsic::x86_avx512_vcvtsd2si32: 2308 case Intrinsic::x86_avx512_vcvtsd2si64: 2309 case Intrinsic::x86_avx512_vcvtsd2usi32: 2310 case Intrinsic::x86_avx512_vcvtsd2usi64: 2311 case Intrinsic::x86_avx512_cvttss2si: 2312 case Intrinsic::x86_avx512_cvttss2si64: 2313 case Intrinsic::x86_avx512_cvttss2usi: 2314 case Intrinsic::x86_avx512_cvttss2usi64: 2315 case Intrinsic::x86_avx512_cvttsd2si: 2316 case Intrinsic::x86_avx512_cvttsd2si64: 2317 case Intrinsic::x86_avx512_cvttsd2usi: 2318 case Intrinsic::x86_avx512_cvttsd2usi64: { 2319 // These intrinsics only demand the 0th element of their input vectors. If 2320 // we can simplify the input based on that, do so now. 2321 Value *Arg = II.getArgOperand(0); 2322 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 2323 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2324 return IC.replaceOperand(II, 0, V); 2325 } 2326 break; 2327 } 2328 2329 case Intrinsic::x86_mmx_pmovmskb: 2330 case Intrinsic::x86_sse_movmsk_ps: 2331 case Intrinsic::x86_sse2_movmsk_pd: 2332 case Intrinsic::x86_sse2_pmovmskb_128: 2333 case Intrinsic::x86_avx_movmsk_pd_256: 2334 case Intrinsic::x86_avx_movmsk_ps_256: 2335 case Intrinsic::x86_avx2_pmovmskb: 2336 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 2337 return IC.replaceInstUsesWith(II, V); 2338 } 2339 break; 2340 2341 case Intrinsic::x86_sse_comieq_ss: 2342 case Intrinsic::x86_sse_comige_ss: 2343 case Intrinsic::x86_sse_comigt_ss: 2344 case Intrinsic::x86_sse_comile_ss: 2345 case Intrinsic::x86_sse_comilt_ss: 2346 case Intrinsic::x86_sse_comineq_ss: 2347 case Intrinsic::x86_sse_ucomieq_ss: 2348 case Intrinsic::x86_sse_ucomige_ss: 2349 case Intrinsic::x86_sse_ucomigt_ss: 2350 case Intrinsic::x86_sse_ucomile_ss: 2351 case Intrinsic::x86_sse_ucomilt_ss: 2352 case Intrinsic::x86_sse_ucomineq_ss: 2353 case Intrinsic::x86_sse2_comieq_sd: 2354 case Intrinsic::x86_sse2_comige_sd: 2355 case Intrinsic::x86_sse2_comigt_sd: 2356 case Intrinsic::x86_sse2_comile_sd: 2357 case Intrinsic::x86_sse2_comilt_sd: 2358 case Intrinsic::x86_sse2_comineq_sd: 2359 case Intrinsic::x86_sse2_ucomieq_sd: 2360 case Intrinsic::x86_sse2_ucomige_sd: 2361 case Intrinsic::x86_sse2_ucomigt_sd: 2362 case Intrinsic::x86_sse2_ucomile_sd: 2363 case Intrinsic::x86_sse2_ucomilt_sd: 2364 case Intrinsic::x86_sse2_ucomineq_sd: 2365 case Intrinsic::x86_avx512_vcomi_ss: 2366 case Intrinsic::x86_avx512_vcomi_sd: 2367 case Intrinsic::x86_avx512_mask_cmp_ss: 2368 case Intrinsic::x86_avx512_mask_cmp_sd: { 2369 // These intrinsics only demand the 0th element of their input vectors. If 2370 // we can simplify the input based on that, do so now. 
2371 bool MadeChange = false; 2372 Value *Arg0 = II.getArgOperand(0); 2373 Value *Arg1 = II.getArgOperand(1); 2374 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2375 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2376 IC.replaceOperand(II, 0, V); 2377 MadeChange = true; 2378 } 2379 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2380 IC.replaceOperand(II, 1, V); 2381 MadeChange = true; 2382 } 2383 if (MadeChange) { 2384 return &II; 2385 } 2386 break; 2387 } 2388 2389 case Intrinsic::x86_avx512_add_ps_512: 2390 case Intrinsic::x86_avx512_div_ps_512: 2391 case Intrinsic::x86_avx512_mul_ps_512: 2392 case Intrinsic::x86_avx512_sub_ps_512: 2393 case Intrinsic::x86_avx512_add_pd_512: 2394 case Intrinsic::x86_avx512_div_pd_512: 2395 case Intrinsic::x86_avx512_mul_pd_512: 2396 case Intrinsic::x86_avx512_sub_pd_512: 2397 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2398 // IR operations. 2399 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2400 if (R->getValue() == 4) { 2401 Value *Arg0 = II.getArgOperand(0); 2402 Value *Arg1 = II.getArgOperand(1); 2403 2404 Value *V; 2405 switch (IID) { 2406 default: 2407 llvm_unreachable("Case stmts out of sync!"); 2408 case Intrinsic::x86_avx512_add_ps_512: 2409 case Intrinsic::x86_avx512_add_pd_512: 2410 V = IC.Builder.CreateFAdd(Arg0, Arg1); 2411 break; 2412 case Intrinsic::x86_avx512_sub_ps_512: 2413 case Intrinsic::x86_avx512_sub_pd_512: 2414 V = IC.Builder.CreateFSub(Arg0, Arg1); 2415 break; 2416 case Intrinsic::x86_avx512_mul_ps_512: 2417 case Intrinsic::x86_avx512_mul_pd_512: 2418 V = IC.Builder.CreateFMul(Arg0, Arg1); 2419 break; 2420 case Intrinsic::x86_avx512_div_ps_512: 2421 case Intrinsic::x86_avx512_div_pd_512: 2422 V = IC.Builder.CreateFDiv(Arg0, Arg1); 2423 break; 2424 } 2425 2426 return IC.replaceInstUsesWith(II, V); 2427 } 2428 } 2429 break; 2430 2431 case Intrinsic::x86_avx512_mask_add_ss_round: 2432 case Intrinsic::x86_avx512_mask_div_ss_round: 2433 case Intrinsic::x86_avx512_mask_mul_ss_round: 2434 case Intrinsic::x86_avx512_mask_sub_ss_round: 2435 case Intrinsic::x86_avx512_mask_add_sd_round: 2436 case Intrinsic::x86_avx512_mask_div_sd_round: 2437 case Intrinsic::x86_avx512_mask_mul_sd_round: 2438 case Intrinsic::x86_avx512_mask_sub_sd_round: 2439 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2440 // IR operations. 2441 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 2442 if (R->getValue() == 4) { 2443 // Extract the element as scalars. 
2444 Value *Arg0 = II.getArgOperand(0); 2445 Value *Arg1 = II.getArgOperand(1); 2446 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 2447 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 2448 2449 Value *V; 2450 switch (IID) { 2451 default: 2452 llvm_unreachable("Case stmts out of sync!"); 2453 case Intrinsic::x86_avx512_mask_add_ss_round: 2454 case Intrinsic::x86_avx512_mask_add_sd_round: 2455 V = IC.Builder.CreateFAdd(LHS, RHS); 2456 break; 2457 case Intrinsic::x86_avx512_mask_sub_ss_round: 2458 case Intrinsic::x86_avx512_mask_sub_sd_round: 2459 V = IC.Builder.CreateFSub(LHS, RHS); 2460 break; 2461 case Intrinsic::x86_avx512_mask_mul_ss_round: 2462 case Intrinsic::x86_avx512_mask_mul_sd_round: 2463 V = IC.Builder.CreateFMul(LHS, RHS); 2464 break; 2465 case Intrinsic::x86_avx512_mask_div_ss_round: 2466 case Intrinsic::x86_avx512_mask_div_sd_round: 2467 V = IC.Builder.CreateFDiv(LHS, RHS); 2468 break; 2469 } 2470 2471 // Handle the masking aspect of the intrinsic. 2472 Value *Mask = II.getArgOperand(3); 2473 auto *C = dyn_cast<ConstantInt>(Mask); 2474 // We don't need a select if we know the mask bit is a 1. 2475 if (!C || !C->getValue()[0]) { 2476 // Cast the mask to an i1 vector and then extract the lowest element. 2477 auto *MaskTy = FixedVectorType::get( 2478 IC.Builder.getInt1Ty(), 2479 cast<IntegerType>(Mask->getType())->getBitWidth()); 2480 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 2481 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 2482 // Extract the lowest element from the passthru operand. 2483 Value *Passthru = 2484 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 2485 V = IC.Builder.CreateSelect(Mask, V, Passthru); 2486 } 2487 2488 // Insert the result back into the original argument 0. 2489 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2490 2491 return IC.replaceInstUsesWith(II, V); 2492 } 2493 } 2494 break; 2495 2496 // Constant fold ashr( <A x Bi>, Ci ). 2497 // Constant fold lshr( <A x Bi>, Ci ). 2498 // Constant fold shl( <A x Bi>, Ci ). 
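// e.g. (illustrative) a call such as @llvm.x86.sse2.pslli.d(<4 x i32> %x,
// i32 3) can be rewritten by simplifyX86immShift as the generic IR
// 'shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>' once the shift amount is
// known to be in range.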
2499 case Intrinsic::x86_sse2_psrai_d: 2500 case Intrinsic::x86_sse2_psrai_w: 2501 case Intrinsic::x86_avx2_psrai_d: 2502 case Intrinsic::x86_avx2_psrai_w: 2503 case Intrinsic::x86_avx512_psrai_q_128: 2504 case Intrinsic::x86_avx512_psrai_q_256: 2505 case Intrinsic::x86_avx512_psrai_d_512: 2506 case Intrinsic::x86_avx512_psrai_q_512: 2507 case Intrinsic::x86_avx512_psrai_w_512: 2508 case Intrinsic::x86_sse2_psrli_d: 2509 case Intrinsic::x86_sse2_psrli_q: 2510 case Intrinsic::x86_sse2_psrli_w: 2511 case Intrinsic::x86_avx2_psrli_d: 2512 case Intrinsic::x86_avx2_psrli_q: 2513 case Intrinsic::x86_avx2_psrli_w: 2514 case Intrinsic::x86_avx512_psrli_d_512: 2515 case Intrinsic::x86_avx512_psrli_q_512: 2516 case Intrinsic::x86_avx512_psrli_w_512: 2517 case Intrinsic::x86_sse2_pslli_d: 2518 case Intrinsic::x86_sse2_pslli_q: 2519 case Intrinsic::x86_sse2_pslli_w: 2520 case Intrinsic::x86_avx2_pslli_d: 2521 case Intrinsic::x86_avx2_pslli_q: 2522 case Intrinsic::x86_avx2_pslli_w: 2523 case Intrinsic::x86_avx512_pslli_d_512: 2524 case Intrinsic::x86_avx512_pslli_q_512: 2525 case Intrinsic::x86_avx512_pslli_w_512: 2526 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2527 return IC.replaceInstUsesWith(II, V); 2528 } 2529 break; 2530 2531 case Intrinsic::x86_sse2_psra_d: 2532 case Intrinsic::x86_sse2_psra_w: 2533 case Intrinsic::x86_avx2_psra_d: 2534 case Intrinsic::x86_avx2_psra_w: 2535 case Intrinsic::x86_avx512_psra_q_128: 2536 case Intrinsic::x86_avx512_psra_q_256: 2537 case Intrinsic::x86_avx512_psra_d_512: 2538 case Intrinsic::x86_avx512_psra_q_512: 2539 case Intrinsic::x86_avx512_psra_w_512: 2540 case Intrinsic::x86_sse2_psrl_d: 2541 case Intrinsic::x86_sse2_psrl_q: 2542 case Intrinsic::x86_sse2_psrl_w: 2543 case Intrinsic::x86_avx2_psrl_d: 2544 case Intrinsic::x86_avx2_psrl_q: 2545 case Intrinsic::x86_avx2_psrl_w: 2546 case Intrinsic::x86_avx512_psrl_d_512: 2547 case Intrinsic::x86_avx512_psrl_q_512: 2548 case Intrinsic::x86_avx512_psrl_w_512: 2549 case Intrinsic::x86_sse2_psll_d: 2550 case Intrinsic::x86_sse2_psll_q: 2551 case Intrinsic::x86_sse2_psll_w: 2552 case Intrinsic::x86_avx2_psll_d: 2553 case Intrinsic::x86_avx2_psll_q: 2554 case Intrinsic::x86_avx2_psll_w: 2555 case Intrinsic::x86_avx512_psll_d_512: 2556 case Intrinsic::x86_avx512_psll_q_512: 2557 case Intrinsic::x86_avx512_psll_w_512: { 2558 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2559 return IC.replaceInstUsesWith(II, V); 2560 } 2561 2562 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2563 // operand to compute the shift amount. 
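// e.g. for psrl.d the count operand is a <4 x i32>, but only its low 64 bits
// (elements 0 and 1) are read, so the demanded-elements call below may shrink
// away the upper half of that operand.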
2564 Value *Arg1 = II.getArgOperand(1); 2565 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2566 "Unexpected packed shift size"); 2567 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 2568 2569 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2570 return IC.replaceOperand(II, 1, V); 2571 } 2572 break; 2573 } 2574 2575 case Intrinsic::x86_avx2_psllv_d: 2576 case Intrinsic::x86_avx2_psllv_d_256: 2577 case Intrinsic::x86_avx2_psllv_q: 2578 case Intrinsic::x86_avx2_psllv_q_256: 2579 case Intrinsic::x86_avx512_psllv_d_512: 2580 case Intrinsic::x86_avx512_psllv_q_512: 2581 case Intrinsic::x86_avx512_psllv_w_128: 2582 case Intrinsic::x86_avx512_psllv_w_256: 2583 case Intrinsic::x86_avx512_psllv_w_512: 2584 case Intrinsic::x86_avx2_psrav_d: 2585 case Intrinsic::x86_avx2_psrav_d_256: 2586 case Intrinsic::x86_avx512_psrav_q_128: 2587 case Intrinsic::x86_avx512_psrav_q_256: 2588 case Intrinsic::x86_avx512_psrav_d_512: 2589 case Intrinsic::x86_avx512_psrav_q_512: 2590 case Intrinsic::x86_avx512_psrav_w_128: 2591 case Intrinsic::x86_avx512_psrav_w_256: 2592 case Intrinsic::x86_avx512_psrav_w_512: 2593 case Intrinsic::x86_avx2_psrlv_d: 2594 case Intrinsic::x86_avx2_psrlv_d_256: 2595 case Intrinsic::x86_avx2_psrlv_q: 2596 case Intrinsic::x86_avx2_psrlv_q_256: 2597 case Intrinsic::x86_avx512_psrlv_d_512: 2598 case Intrinsic::x86_avx512_psrlv_q_512: 2599 case Intrinsic::x86_avx512_psrlv_w_128: 2600 case Intrinsic::x86_avx512_psrlv_w_256: 2601 case Intrinsic::x86_avx512_psrlv_w_512: 2602 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 2603 return IC.replaceInstUsesWith(II, V); 2604 } 2605 break; 2606 2607 case Intrinsic::x86_sse2_packssdw_128: 2608 case Intrinsic::x86_sse2_packsswb_128: 2609 case Intrinsic::x86_avx2_packssdw: 2610 case Intrinsic::x86_avx2_packsswb: 2611 case Intrinsic::x86_avx512_packssdw_512: 2612 case Intrinsic::x86_avx512_packsswb_512: 2613 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 2614 return IC.replaceInstUsesWith(II, V); 2615 } 2616 break; 2617 2618 case Intrinsic::x86_sse2_packuswb_128: 2619 case Intrinsic::x86_sse41_packusdw: 2620 case Intrinsic::x86_avx2_packusdw: 2621 case Intrinsic::x86_avx2_packuswb: 2622 case Intrinsic::x86_avx512_packusdw_512: 2623 case Intrinsic::x86_avx512_packuswb_512: 2624 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 2625 return IC.replaceInstUsesWith(II, V); 2626 } 2627 break; 2628 2629 case Intrinsic::x86_sse2_pmulh_w: 2630 case Intrinsic::x86_avx2_pmulh_w: 2631 case Intrinsic::x86_avx512_pmulh_w_512: 2632 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) { 2633 return IC.replaceInstUsesWith(II, V); 2634 } 2635 break; 2636 2637 case Intrinsic::x86_sse2_pmulhu_w: 2638 case Intrinsic::x86_avx2_pmulhu_w: 2639 case Intrinsic::x86_avx512_pmulhu_w_512: 2640 if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) { 2641 return IC.replaceInstUsesWith(II, V); 2642 } 2643 break; 2644 2645 case Intrinsic::x86_ssse3_pmul_hr_sw_128: 2646 case Intrinsic::x86_avx2_pmul_hr_sw: 2647 case Intrinsic::x86_avx512_pmul_hr_sw_512: 2648 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) { 2649 return IC.replaceInstUsesWith(II, V); 2650 } 2651 break; 2652 2653 case Intrinsic::x86_sse2_pmadd_wd: 2654 case Intrinsic::x86_avx2_pmadd_wd: 2655 case Intrinsic::x86_avx512_pmaddw_d_512: 2656 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) { 2657 return IC.replaceInstUsesWith(II, V); 2658 } 2659 break; 2660 2661 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 
2662 case Intrinsic::x86_avx2_pmadd_ub_sw: 2663 case Intrinsic::x86_avx512_pmaddubs_w_512: 2664 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) { 2665 return IC.replaceInstUsesWith(II, V); 2666 } 2667 break; 2668 2669 case Intrinsic::x86_pclmulqdq: 2670 case Intrinsic::x86_pclmulqdq_256: 2671 case Intrinsic::x86_pclmulqdq_512: { 2672 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2673 unsigned Imm = C->getZExtValue(); 2674 2675 bool MadeChange = false; 2676 Value *Arg0 = II.getArgOperand(0); 2677 Value *Arg1 = II.getArgOperand(1); 2678 unsigned VWidth = 2679 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2680 2681 APInt UndefElts1(VWidth, 0); 2682 APInt DemandedElts1 = 2683 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); 2684 if (Value *V = 2685 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 2686 IC.replaceOperand(II, 0, V); 2687 MadeChange = true; 2688 } 2689 2690 APInt UndefElts2(VWidth, 0); 2691 APInt DemandedElts2 = 2692 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 2693 if (Value *V = 2694 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 2695 IC.replaceOperand(II, 1, V); 2696 MadeChange = true; 2697 } 2698 2699 // If either input elements are undef, the result is zero. 2700 if (DemandedElts1.isSubsetOf(UndefElts1) || 2701 DemandedElts2.isSubsetOf(UndefElts2)) { 2702 return IC.replaceInstUsesWith(II, 2703 ConstantAggregateZero::get(II.getType())); 2704 } 2705 2706 if (MadeChange) { 2707 return &II; 2708 } 2709 } 2710 break; 2711 } 2712 2713 case Intrinsic::x86_sse41_insertps: 2714 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 2715 return IC.replaceInstUsesWith(II, V); 2716 } 2717 break; 2718 2719 case Intrinsic::x86_sse4a_extrq: { 2720 Value *Op0 = II.getArgOperand(0); 2721 Value *Op1 = II.getArgOperand(1); 2722 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2723 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2724 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2725 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2726 VWidth1 == 16 && "Unexpected operand sizes"); 2727 2728 // See if we're dealing with constant values. 2729 auto *C1 = dyn_cast<Constant>(Op1); 2730 auto *CILength = 2731 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2732 : nullptr; 2733 auto *CIIndex = 2734 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2735 : nullptr; 2736 2737 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2738 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2739 return IC.replaceInstUsesWith(II, V); 2740 } 2741 2742 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2743 // operands and the lowest 16-bits of the second. 2744 bool MadeChange = false; 2745 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2746 IC.replaceOperand(II, 0, V); 2747 MadeChange = true; 2748 } 2749 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2750 IC.replaceOperand(II, 1, V); 2751 MadeChange = true; 2752 } 2753 if (MadeChange) { 2754 return &II; 2755 } 2756 break; 2757 } 2758 2759 case Intrinsic::x86_sse4a_extrqi: { 2760 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2761 // bits of the lower 64-bits. The upper 64-bits are undefined. 
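// e.g. (illustrative) extrqi(x, /*Length=*/8, /*Index=*/16) produces
// (x[63:0] >> 16) & 0xFF in the low 64 bits of the result, with the high
// 64 bits left undefined.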
2762 Value *Op0 = II.getArgOperand(0); 2763 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2764 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2765 "Unexpected operand size"); 2766 2767 // See if we're dealing with constant values. 2768 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 2769 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2770 2771 // Attempt to simplify to a constant or shuffle vector. 2772 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2773 return IC.replaceInstUsesWith(II, V); 2774 } 2775 2776 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2777 // operand. 2778 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2779 return IC.replaceOperand(II, 0, V); 2780 } 2781 break; 2782 } 2783 2784 case Intrinsic::x86_sse4a_insertq: { 2785 Value *Op0 = II.getArgOperand(0); 2786 Value *Op1 = II.getArgOperand(1); 2787 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2788 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2789 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2790 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 2791 "Unexpected operand size"); 2792 2793 // See if we're dealing with constant values. 2794 auto *C1 = dyn_cast<Constant>(Op1); 2795 auto *CI11 = 2796 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2797 : nullptr; 2798 2799 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2800 if (CI11) { 2801 const APInt &V11 = CI11->getValue(); 2802 APInt Len = V11.zextOrTrunc(6); 2803 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2804 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2805 return IC.replaceInstUsesWith(II, V); 2806 } 2807 } 2808 2809 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2810 // operand. 2811 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2812 return IC.replaceOperand(II, 0, V); 2813 } 2814 break; 2815 } 2816 2817 case Intrinsic::x86_sse4a_insertqi: { 2818 // INSERTQI: Extract lowest Length bits from lower half of second source and 2819 // insert over first source starting at Index bit. The upper 64-bits are 2820 // undefined. 2821 Value *Op0 = II.getArgOperand(0); 2822 Value *Op1 = II.getArgOperand(1); 2823 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2824 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2825 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2826 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2827 VWidth1 == 2 && "Unexpected operand sizes"); 2828 2829 // See if we're dealing with constant values. 2830 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2831 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 2832 2833 // Attempt to simplify to a constant or shuffle vector. 2834 if (CILength && CIIndex) { 2835 APInt Len = CILength->getValue().zextOrTrunc(6); 2836 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2837 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2838 return IC.replaceInstUsesWith(II, V); 2839 } 2840 } 2841 2842 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2843 // operands. 
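// (Only the low qwords participate: e.g. insertqi(x, y, /*Length=*/8,
// /*Index=*/16) replaces bits [23:16] of x's low qword with bits [7:0] of
// y's low qword, so the upper element of each operand is irrelevant.)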
2844 bool MadeChange = false; 2845 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2846 IC.replaceOperand(II, 0, V); 2847 MadeChange = true; 2848 } 2849 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2850 IC.replaceOperand(II, 1, V); 2851 MadeChange = true; 2852 } 2853 if (MadeChange) { 2854 return &II; 2855 } 2856 break; 2857 } 2858 2859 case Intrinsic::x86_sse41_pblendvb: 2860 case Intrinsic::x86_sse41_blendvps: 2861 case Intrinsic::x86_sse41_blendvpd: 2862 case Intrinsic::x86_avx_blendv_ps_256: 2863 case Intrinsic::x86_avx_blendv_pd_256: 2864 case Intrinsic::x86_avx2_pblendvb: { 2865 // fold (blend A, A, Mask) -> A 2866 Value *Op0 = II.getArgOperand(0); 2867 Value *Op1 = II.getArgOperand(1); 2868 Value *Mask = II.getArgOperand(2); 2869 if (Op0 == Op1) { 2870 return IC.replaceInstUsesWith(II, Op0); 2871 } 2872 2873 // Zero Mask - select 1st argument. 2874 if (isa<ConstantAggregateZero>(Mask)) { 2875 return IC.replaceInstUsesWith(II, Op0); 2876 } 2877 2878 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 2879 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 2880 Constant *NewSelector = 2881 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout()); 2882 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 2883 } 2884 2885 Mask = InstCombiner::peekThroughBitcast(Mask); 2886 2887 // Peek through a one-use shuffle - VectorCombine should have simplified 2888 // this for cases where we're splitting wider vectors to use blendv 2889 // intrinsics. 2890 Value *MaskSrc = nullptr; 2891 ArrayRef<int> ShuffleMask; 2892 if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(), 2893 m_Mask(ShuffleMask))))) { 2894 // Bail if the shuffle was irregular or contains undefs. 2895 int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements(); 2896 if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) || 2897 any_of(ShuffleMask, 2898 [NumElts](int M) { return M < 0 || M >= NumElts; })) 2899 break; 2900 Mask = InstCombiner::peekThroughBitcast(MaskSrc); 2901 } 2902 2903 // Convert to a vector select if we can bypass casts and find a boolean 2904 // vector condition value. 2905 Value *BoolVec; 2906 if (match(Mask, m_SExt(m_Value(BoolVec))) && 2907 BoolVec->getType()->isVectorTy() && 2908 BoolVec->getType()->getScalarSizeInBits() == 1) { 2909 auto *MaskTy = cast<FixedVectorType>(Mask->getType()); 2910 auto *OpTy = cast<FixedVectorType>(II.getType()); 2911 unsigned NumMaskElts = MaskTy->getNumElements(); 2912 unsigned NumOperandElts = OpTy->getNumElements(); 2913 2914 // If we peeked through a shuffle, reapply the shuffle to the bool vector. 2915 if (MaskSrc) { 2916 unsigned NumMaskSrcElts = 2917 cast<FixedVectorType>(MaskSrc->getType())->getNumElements(); 2918 NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts; 2919 // Multiple mask bits maps to the same operand element - bail out. 
2920 if (NumMaskElts > NumOperandElts) 2921 break; 2922 SmallVector<int> ScaledMask; 2923 if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask)) 2924 break; 2925 BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask); 2926 MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts); 2927 } 2928 assert(MaskTy->getPrimitiveSizeInBits() == 2929 OpTy->getPrimitiveSizeInBits() && 2930 "Not expecting mask and operands with different sizes"); 2931 2932 if (NumMaskElts == NumOperandElts) { 2933 return SelectInst::Create(BoolVec, Op1, Op0); 2934 } 2935 2936 // If the mask has less elements than the operands, each mask bit maps to 2937 // multiple elements of the operands. Bitcast back and forth. 2938 if (NumMaskElts < NumOperandElts) { 2939 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy); 2940 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy); 2941 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 2942 return new BitCastInst(Sel, II.getType()); 2943 } 2944 } 2945 2946 break; 2947 } 2948 2949 case Intrinsic::x86_ssse3_pshuf_b_128: 2950 case Intrinsic::x86_avx2_pshuf_b: 2951 case Intrinsic::x86_avx512_pshuf_b_512: 2952 if (Value *V = simplifyX86pshufb(II, IC.Builder)) { 2953 return IC.replaceInstUsesWith(II, V); 2954 } 2955 break; 2956 2957 case Intrinsic::x86_avx_vpermilvar_ps: 2958 case Intrinsic::x86_avx_vpermilvar_ps_256: 2959 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2960 case Intrinsic::x86_avx_vpermilvar_pd: 2961 case Intrinsic::x86_avx_vpermilvar_pd_256: 2962 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2963 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 2964 return IC.replaceInstUsesWith(II, V); 2965 } 2966 break; 2967 2968 case Intrinsic::x86_avx2_permd: 2969 case Intrinsic::x86_avx2_permps: 2970 case Intrinsic::x86_avx512_permvar_df_256: 2971 case Intrinsic::x86_avx512_permvar_df_512: 2972 case Intrinsic::x86_avx512_permvar_di_256: 2973 case Intrinsic::x86_avx512_permvar_di_512: 2974 case Intrinsic::x86_avx512_permvar_hi_128: 2975 case Intrinsic::x86_avx512_permvar_hi_256: 2976 case Intrinsic::x86_avx512_permvar_hi_512: 2977 case Intrinsic::x86_avx512_permvar_qi_128: 2978 case Intrinsic::x86_avx512_permvar_qi_256: 2979 case Intrinsic::x86_avx512_permvar_qi_512: 2980 case Intrinsic::x86_avx512_permvar_sf_512: 2981 case Intrinsic::x86_avx512_permvar_si_512: 2982 if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 2983 return IC.replaceInstUsesWith(II, V); 2984 } 2985 break; 2986 2987 case Intrinsic::x86_avx512_vpermi2var_d_128: 2988 case Intrinsic::x86_avx512_vpermi2var_d_256: 2989 case Intrinsic::x86_avx512_vpermi2var_d_512: 2990 case Intrinsic::x86_avx512_vpermi2var_hi_128: 2991 case Intrinsic::x86_avx512_vpermi2var_hi_256: 2992 case Intrinsic::x86_avx512_vpermi2var_hi_512: 2993 case Intrinsic::x86_avx512_vpermi2var_pd_128: 2994 case Intrinsic::x86_avx512_vpermi2var_pd_256: 2995 case Intrinsic::x86_avx512_vpermi2var_pd_512: 2996 case Intrinsic::x86_avx512_vpermi2var_ps_128: 2997 case Intrinsic::x86_avx512_vpermi2var_ps_256: 2998 case Intrinsic::x86_avx512_vpermi2var_ps_512: 2999 case Intrinsic::x86_avx512_vpermi2var_q_128: 3000 case Intrinsic::x86_avx512_vpermi2var_q_256: 3001 case Intrinsic::x86_avx512_vpermi2var_q_512: 3002 case Intrinsic::x86_avx512_vpermi2var_qi_128: 3003 case Intrinsic::x86_avx512_vpermi2var_qi_256: 3004 case Intrinsic::x86_avx512_vpermi2var_qi_512: 3005 if (Value *V = simplifyX86vpermv3(II, IC.Builder)) { 3006 return IC.replaceInstUsesWith(II, V); 3007 } 3008 break; 3009 3010 case 
Intrinsic::x86_avx_maskload_ps: 3011 case Intrinsic::x86_avx_maskload_pd: 3012 case Intrinsic::x86_avx_maskload_ps_256: 3013 case Intrinsic::x86_avx_maskload_pd_256: 3014 case Intrinsic::x86_avx2_maskload_d: 3015 case Intrinsic::x86_avx2_maskload_q: 3016 case Intrinsic::x86_avx2_maskload_d_256: 3017 case Intrinsic::x86_avx2_maskload_q_256: 3018 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { 3019 return I; 3020 } 3021 break; 3022 3023 case Intrinsic::x86_sse2_maskmov_dqu: 3024 case Intrinsic::x86_avx_maskstore_ps: 3025 case Intrinsic::x86_avx_maskstore_pd: 3026 case Intrinsic::x86_avx_maskstore_ps_256: 3027 case Intrinsic::x86_avx_maskstore_pd_256: 3028 case Intrinsic::x86_avx2_maskstore_d: 3029 case Intrinsic::x86_avx2_maskstore_q: 3030 case Intrinsic::x86_avx2_maskstore_d_256: 3031 case Intrinsic::x86_avx2_maskstore_q_256: 3032 if (simplifyX86MaskedStore(II, IC)) { 3033 return nullptr; 3034 } 3035 break; 3036 3037 case Intrinsic::x86_addcarry_32: 3038 case Intrinsic::x86_addcarry_64: 3039 if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 3040 return IC.replaceInstUsesWith(II, V); 3041 } 3042 break; 3043 3044 case Intrinsic::x86_avx512_pternlog_d_128: 3045 case Intrinsic::x86_avx512_pternlog_d_256: 3046 case Intrinsic::x86_avx512_pternlog_d_512: 3047 case Intrinsic::x86_avx512_pternlog_q_128: 3048 case Intrinsic::x86_avx512_pternlog_q_256: 3049 case Intrinsic::x86_avx512_pternlog_q_512: 3050 if (Value *V = simplifyTernarylogic(II, IC.Builder)) { 3051 return IC.replaceInstUsesWith(II, V); 3052 } 3053 break; 3054 default: 3055 break; 3056 } 3057 return std::nullopt; 3058 } 3059 3060 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 3061 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 3062 bool &KnownBitsComputed) const { 3063 switch (II.getIntrinsicID()) { 3064 default: 3065 break; 3066 case Intrinsic::x86_mmx_pmovmskb: 3067 case Intrinsic::x86_sse_movmsk_ps: 3068 case Intrinsic::x86_sse2_movmsk_pd: 3069 case Intrinsic::x86_sse2_pmovmskb_128: 3070 case Intrinsic::x86_avx_movmsk_ps_256: 3071 case Intrinsic::x86_avx_movmsk_pd_256: 3072 case Intrinsic::x86_avx2_pmovmskb: { 3073 // MOVMSK copies the vector elements' sign bits to the low bits 3074 // and zeros the high bits. 3075 unsigned ArgWidth; 3076 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { 3077 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. 3078 } else { 3079 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType()); 3080 ArgWidth = ArgType->getNumElements(); 3081 } 3082 3083 // If we don't need any of low bits then return zero, 3084 // we know that DemandedMask is non-zero already. 3085 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); 3086 Type *VTy = II.getType(); 3087 if (DemandedElts.isZero()) { 3088 return ConstantInt::getNullValue(VTy); 3089 } 3090 3091 // We know that the upper bits are set to zero. 
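// e.g. sse2.pmovmskb.128 packs 16 sign bits into an i32 result, so bits
// [31:16] are always zero; recording that can let later combines remove
// redundant masking of the result.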
3092 Known.Zero.setBitsFrom(ArgWidth); 3093 KnownBitsComputed = true; 3094 break; 3095 } 3096 } 3097 return std::nullopt; 3098 } 3099 3100 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( 3101 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 3102 APInt &UndefElts2, APInt &UndefElts3, 3103 std::function<void(Instruction *, unsigned, APInt, APInt &)> 3104 simplifyAndSetOp) const { 3105 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); 3106 switch (II.getIntrinsicID()) { 3107 default: 3108 break; 3109 case Intrinsic::x86_xop_vfrcz_ss: 3110 case Intrinsic::x86_xop_vfrcz_sd: 3111 // The instructions for these intrinsics are speced to zero upper bits not 3112 // pass them through like other scalar intrinsics. So we shouldn't just 3113 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. 3114 // Instead we should return a zero vector. 3115 if (!DemandedElts[0]) { 3116 IC.addToWorklist(&II); 3117 return ConstantAggregateZero::get(II.getType()); 3118 } 3119 3120 // Only the lower element is used. 3121 DemandedElts = 1; 3122 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3123 3124 // Only the lower element is undefined. The high elements are zero. 3125 UndefElts = UndefElts[0]; 3126 break; 3127 3128 // Unary scalar-as-vector operations that work column-wise. 3129 case Intrinsic::x86_sse_rcp_ss: 3130 case Intrinsic::x86_sse_rsqrt_ss: 3131 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3132 3133 // If lowest element of a scalar op isn't used then use Arg0. 3134 if (!DemandedElts[0]) { 3135 IC.addToWorklist(&II); 3136 return II.getArgOperand(0); 3137 } 3138 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 3139 // checks). 3140 break; 3141 3142 // Binary scalar-as-vector operations that work column-wise. The high 3143 // elements come from operand 0. The low element is a function of both 3144 // operands. 3145 case Intrinsic::x86_sse_min_ss: 3146 case Intrinsic::x86_sse_max_ss: 3147 case Intrinsic::x86_sse_cmp_ss: 3148 case Intrinsic::x86_sse2_min_sd: 3149 case Intrinsic::x86_sse2_max_sd: 3150 case Intrinsic::x86_sse2_cmp_sd: { 3151 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3152 3153 // If lowest element of a scalar op isn't used then use Arg0. 3154 if (!DemandedElts[0]) { 3155 IC.addToWorklist(&II); 3156 return II.getArgOperand(0); 3157 } 3158 3159 // Only lower element is used for operand 1. 3160 DemandedElts = 1; 3161 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3162 3163 // Lower element is undefined if both lower elements are undefined. 3164 // Consider things like undef&0. The result is known zero, not undef. 3165 if (!UndefElts2[0]) 3166 UndefElts.clearBit(0); 3167 3168 break; 3169 } 3170 3171 // Binary scalar-as-vector operations that work column-wise. The high 3172 // elements come from operand 0 and the low element comes from operand 1. 3173 case Intrinsic::x86_sse41_round_ss: 3174 case Intrinsic::x86_sse41_round_sd: { 3175 // Don't use the low element of operand 0. 3176 APInt DemandedElts2 = DemandedElts; 3177 DemandedElts2.clearBit(0); 3178 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 3179 3180 // If lowest element of a scalar op isn't used then use Arg0. 3181 if (!DemandedElts[0]) { 3182 IC.addToWorklist(&II); 3183 return II.getArgOperand(0); 3184 } 3185 3186 // Only lower element is used for operand 1. 
3187 DemandedElts = 1; 3188 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3189 3190 // Take the high undef elements from operand 0 and take the lower element 3191 // from operand 1. 3192 UndefElts.clearBit(0); 3193 UndefElts |= UndefElts2[0]; 3194 break; 3195 } 3196 3197 // Three input scalar-as-vector operations that work column-wise. The high 3198 // elements come from operand 0 and the low element is a function of all 3199 // three inputs. 3200 case Intrinsic::x86_avx512_mask_add_ss_round: 3201 case Intrinsic::x86_avx512_mask_div_ss_round: 3202 case Intrinsic::x86_avx512_mask_mul_ss_round: 3203 case Intrinsic::x86_avx512_mask_sub_ss_round: 3204 case Intrinsic::x86_avx512_mask_max_ss_round: 3205 case Intrinsic::x86_avx512_mask_min_ss_round: 3206 case Intrinsic::x86_avx512_mask_add_sd_round: 3207 case Intrinsic::x86_avx512_mask_div_sd_round: 3208 case Intrinsic::x86_avx512_mask_mul_sd_round: 3209 case Intrinsic::x86_avx512_mask_sub_sd_round: 3210 case Intrinsic::x86_avx512_mask_max_sd_round: 3211 case Intrinsic::x86_avx512_mask_min_sd_round: 3212 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3213 3214 // If lowest element of a scalar op isn't used then use Arg0. 3215 if (!DemandedElts[0]) { 3216 IC.addToWorklist(&II); 3217 return II.getArgOperand(0); 3218 } 3219 3220 // Only lower element is used for operand 1 and 2. 3221 DemandedElts = 1; 3222 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3223 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 3224 3225 // Lower element is undefined if all three lower elements are undefined. 3226 // Consider things like undef&0. The result is known zero, not undef. 3227 if (!UndefElts2[0] || !UndefElts3[0]) 3228 UndefElts.clearBit(0); 3229 break; 3230 3231 // TODO: Add fmaddsub support? 3232 case Intrinsic::x86_sse3_addsub_pd: 3233 case Intrinsic::x86_sse3_addsub_ps: 3234 case Intrinsic::x86_avx_addsub_pd_256: 3235 case Intrinsic::x86_avx_addsub_ps_256: { 3236 // If none of the even or none of the odd lanes are required, turn this 3237 // into a generic FP math instruction. 3238 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 3239 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 3240 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 3241 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 3242 if (IsSubOnly || IsAddOnly) { 3243 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 3244 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 3245 IC.Builder.SetInsertPoint(&II); 3246 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 3247 return IC.Builder.CreateBinOp( 3248 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 3249 } 3250 3251 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3252 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3253 UndefElts &= UndefElts2; 3254 break; 3255 } 3256 3257 // General per-element vector operations. 
3258 case Intrinsic::x86_avx2_psllv_d: 3259 case Intrinsic::x86_avx2_psllv_d_256: 3260 case Intrinsic::x86_avx2_psllv_q: 3261 case Intrinsic::x86_avx2_psllv_q_256: 3262 case Intrinsic::x86_avx2_psrlv_d: 3263 case Intrinsic::x86_avx2_psrlv_d_256: 3264 case Intrinsic::x86_avx2_psrlv_q: 3265 case Intrinsic::x86_avx2_psrlv_q_256: 3266 case Intrinsic::x86_avx2_psrav_d: 3267 case Intrinsic::x86_avx2_psrav_d_256: { 3268 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3269 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3270 UndefElts &= UndefElts2; 3271 break; 3272 } 3273 3274 case Intrinsic::x86_sse2_pmulh_w: 3275 case Intrinsic::x86_avx2_pmulh_w: 3276 case Intrinsic::x86_avx512_pmulh_w_512: 3277 case Intrinsic::x86_sse2_pmulhu_w: 3278 case Intrinsic::x86_avx2_pmulhu_w: 3279 case Intrinsic::x86_avx512_pmulhu_w_512: 3280 case Intrinsic::x86_ssse3_pmul_hr_sw_128: 3281 case Intrinsic::x86_avx2_pmul_hr_sw: 3282 case Intrinsic::x86_avx512_pmul_hr_sw_512: { 3283 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3284 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3285 // NOTE: mulh(undef,undef) != undef. 3286 break; 3287 } 3288 3289 case Intrinsic::x86_sse2_packssdw_128: 3290 case Intrinsic::x86_sse2_packsswb_128: 3291 case Intrinsic::x86_sse2_packuswb_128: 3292 case Intrinsic::x86_sse41_packusdw: 3293 case Intrinsic::x86_avx2_packssdw: 3294 case Intrinsic::x86_avx2_packsswb: 3295 case Intrinsic::x86_avx2_packusdw: 3296 case Intrinsic::x86_avx2_packuswb: 3297 case Intrinsic::x86_avx512_packssdw_512: 3298 case Intrinsic::x86_avx512_packsswb_512: 3299 case Intrinsic::x86_avx512_packusdw_512: 3300 case Intrinsic::x86_avx512_packuswb_512: { 3301 auto *Ty0 = II.getArgOperand(0)->getType(); 3302 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 3303 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 3304 3305 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 3306 unsigned VWidthPerLane = VWidth / NumLanes; 3307 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 3308 3309 // Per lane, pack the elements of the first input and then the second. 3310 // e.g. 3311 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 3312 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 3313 for (int OpNum = 0; OpNum != 2; ++OpNum) { 3314 APInt OpDemandedElts(InnerVWidth, 0); 3315 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3316 unsigned LaneIdx = Lane * VWidthPerLane; 3317 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 3318 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 3319 if (DemandedElts[Idx]) 3320 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 3321 } 3322 } 3323 3324 // Demand elements from the operand. 3325 APInt OpUndefElts(InnerVWidth, 0); 3326 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 3327 3328 // Pack the operand's UNDEF elements, one lane at a time. 
3329 OpUndefElts = OpUndefElts.zext(VWidth); 3330 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3331 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 3332 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 3333 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 3334 UndefElts |= LaneElts; 3335 } 3336 } 3337 break; 3338 } 3339 3340 case Intrinsic::x86_sse2_pmadd_wd: 3341 case Intrinsic::x86_avx2_pmadd_wd: 3342 case Intrinsic::x86_avx512_pmaddw_d_512: 3343 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 3344 case Intrinsic::x86_avx2_pmadd_ub_sw: 3345 case Intrinsic::x86_avx512_pmaddubs_w_512: { 3346 // PMADD - demand both src elements that map to each dst element. 3347 auto *ArgTy = II.getArgOperand(0)->getType(); 3348 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements(); 3349 assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); 3350 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth); 3351 APInt Op0UndefElts(InnerVWidth, 0); 3352 APInt Op1UndefElts(InnerVWidth, 0); 3353 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts); 3354 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts); 3355 // NOTE: madd(undef,undef) != undef. 3356 break; 3357 } 3358 3359 // PSHUFB 3360 case Intrinsic::x86_ssse3_pshuf_b_128: 3361 case Intrinsic::x86_avx2_pshuf_b: 3362 case Intrinsic::x86_avx512_pshuf_b_512: 3363 // PERMILVAR 3364 case Intrinsic::x86_avx_vpermilvar_ps: 3365 case Intrinsic::x86_avx_vpermilvar_ps_256: 3366 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3367 case Intrinsic::x86_avx_vpermilvar_pd: 3368 case Intrinsic::x86_avx_vpermilvar_pd_256: 3369 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3370 // PERMV 3371 case Intrinsic::x86_avx2_permd: 3372 case Intrinsic::x86_avx2_permps: { 3373 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 3374 break; 3375 } 3376 3377 // SSE4A instructions leave the upper 64-bits of the 128-bit result 3378 // in an undefined state. 3379 case Intrinsic::x86_sse4a_extrq: 3380 case Intrinsic::x86_sse4a_extrqi: 3381 case Intrinsic::x86_sse4a_insertq: 3382 case Intrinsic::x86_sse4a_insertqi: 3383 UndefElts.setHighBits(VWidth / 2); 3384 break; 3385 } 3386 return std::nullopt; 3387 } 3388