//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the X86-specific hooks of the TargetTransformInfo
/// analysis that are used by InstCombine. It uses the target's detailed
/// information to fold X86 intrinsic calls into simpler, target-independent
/// IR where possible, while letting the generic InstCombine logic handle the
/// rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to
    // match the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
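    // Illustrative transform (a sketch; the value names are made up):
    //   %v = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %p, <8 x i32> %m)
    // becomes, once %m is known to be a sign-extended <8 x i1> %b,
    //   %c = bitcast i8* %p to <8 x float>*
    //   %v = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(
    //            <8 x float>* %c, i32 1, <8 x i1> %b, <8 x float> zeroinitializer)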
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
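  // Illustrative fold (sketch):
  //   psrl.d(%v, <4 x i32> <i32 5, i32 0, i32 0, i32 0>)
  // only reads the low 64 bits of the amount, so it becomes
  //   lshr <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>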
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
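// For example (sketch, constant amounts only):
//   psrav.d(%x, <i32 1, i32 200, i32 undef, i32 3>)
// becomes
//   ashr <4 x i32> %x, <i32 1, i32 31, i32 undef, i32 3>
// because an out-of-range arithmetic shift splats the sign bit (shift by 31).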
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
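  // e.g. (sketch) psrlv.d(%x, <i32 64, i32 undef, i32 99, i32 32>) folds to
  // <4 x i32> <i32 0, i32 undef, i32 0, i32 0> below.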
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
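  // e.g. (sketch) for a 256-bit PACKSSDW (two <8 x i32> args, <16 x i16>
  // result), NumLanes = 2 and NumSrcEltsPerLane = 4, so the mask below is
  //   <0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15>
  // i.e. the sources are interleaved 128-bit lane by 128-bit lane.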
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
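  // e.g. insertps(%x, %y, 0x0F) ignores both selected lanes and is simply a
  // zero vector.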
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.
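    // e.g. a control byte of 0x83 (sign bit set) selects a zero byte, while
    // 0x03 selects byte 3 from the same 128-bit lane of the source.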

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
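  // e.g. an 8-element vpermd masks each control value with (Size - 1), so a
  // control element of 11 selects source element 3 (11 & 7).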
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
}

Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
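      // (Illustration: bzhi(x, 8) keeps only the low 8 bits, i.e. it could
      // become x & 0xff.)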
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
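    // e.g. an insertelement into one of the upper lanes of Arg0 or Arg1 can
    // be dropped, since only element 0 takes part in the comparison.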
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
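  // e.g. (sketch) psrli.w(<8 x i16> %v, i32 3) becomes
  //   lshr <8 x i16> %v, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>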
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
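    // e.g. for psrl.w the amount vector is <8 x i16>, but only its low four
    // elements (the low 64 bits) matter, hence the VWidth / 2 below.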
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If the demanded elements of either input are undef, the result is
      // zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source
    // and insert over first source starting at Index bit. The upper 64-bits
    // are undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
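    // e.g. (illustrative IR) a mask produced by a sign-extended compare:
    //   %s = sext <4 x i1> %cond to <4 x i32>
    //   %m = bitcast <4 x i32> %s to <4 x float>
    //   %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x,
    //                                                  <4 x float> %y,
    //                                                  <4 x float> %m)
    // can be rewritten as:
    //   %r = select <4 x i1> %cond, <4 x float> %y, <4 x float> %x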
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  default:
    break;
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero; we already know
    // that DemandedMask is non-zero.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits rather than pass them through like other scalar intrinsics, so we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
    // other intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
    // rounding/exception checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
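  // e.g. (illustrative) for min_ss: Res[0] combines Op0[0] and Op1[0] (their
  // minimum), while Res[1..3] are taken from Op0, so if only the upper
  // elements are demanded the whole call can be replaced by operand 0.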
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // The lower element is undefined only if both lower elements are
    // undefined. Consider things like undef & 0. The result is known zero,
    // not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);

    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three-input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // The lower element is undefined only if all three lower elements are
    // undefined. Consider things like undef & 0. The result is known zero,
    // not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
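    // e.g. (illustrative) addsub_ps(a, b) computes
    //   { a[0]-b[0], a[1]+b[1], a[2]-b[2], a[3]+b[3] },
    // so if only the even lanes are demanded it is equivalent to an fsub, and
    // if only the odd lanes are demanded it is equivalent to an fadd.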
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
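      // e.g. (illustrative) for the v8i16 PACK above, operand 0's undef bits
      // land in result elements [0..3] and operand 1's in [4..7]; with
      // multiple 128-bit lanes the same pattern repeats within each lane.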
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return None;
}