1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements a TargetTransformInfo analysis pass specific to the 10 /// X86 target machine. It uses the target's detailed information to provide 11 /// more precise answers to certain TTI queries, while letting the target 12 /// independent and default TTI implementations handle the rest. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "X86TargetTransformInfo.h" 17 #include "llvm/IR/IntrinsicInst.h" 18 #include "llvm/IR/IntrinsicsX86.h" 19 #include "llvm/Support/KnownBits.h" 20 #include "llvm/Transforms/InstCombine/InstCombiner.h" 21 #include <optional> 22 23 using namespace llvm; 24 25 #define DEBUG_TYPE "x86tti" 26 27 /// Return a constant boolean vector that has true elements in all positions 28 /// where the input constant data vector has an element with the sign bit set. 29 static Constant *getNegativeIsTrueBoolVec(Constant *V) { 30 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); 31 V = ConstantExpr::getBitCast(V, IntTy); 32 V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy), 33 V); 34 return V; 35 } 36 37 /// Convert the x86 XMM integer vector mask to a vector of bools based on 38 /// each element's most significant bit (the sign bit). 39 static Value *getBoolVecFromMask(Value *Mask) { 40 // Fold Constant Mask. 41 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) 42 return getNegativeIsTrueBoolVec(ConstantMask); 43 44 // Mask was extended from a boolean vector. 45 Value *ExtMask; 46 if (PatternMatch::match( 47 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && 48 ExtMask->getType()->isIntOrIntVectorTy(1)) 49 return ExtMask; 50 51 return nullptr; 52 } 53 54 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 55 // XMM register mask efficiently, we could transform all x86 masked intrinsics 56 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 57 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 58 Value *Ptr = II.getOperand(0); 59 Value *Mask = II.getOperand(1); 60 Constant *ZeroVec = Constant::getNullValue(II.getType()); 61 62 // Zero Mask - masked load instruction creates a zero vector. 63 if (isa<ConstantAggregateZero>(Mask)) 64 return IC.replaceInstUsesWith(II, ZeroVec); 65 66 // The mask is constant or extended from a bool vector. Convert this x86 67 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 68 if (Value *BoolMask = getBoolVecFromMask(Mask)) { 69 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 70 // the LLVM intrinsic definition for the pointer argument. 71 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 72 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 73 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 74 75 // The pass-through vector for an x86 masked load is a zero vector. 76 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( 77 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec); 78 return IC.replaceInstUsesWith(II, NewMaskedLoad); 79 } 80 81 return nullptr; 82 } 83 84 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 85 // XMM register mask efficiently, we could transform all x86 masked intrinsics 86 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 87 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 88 Value *Ptr = II.getOperand(0); 89 Value *Mask = II.getOperand(1); 90 Value *Vec = II.getOperand(2); 91 92 // Zero Mask - this masked store instruction does nothing. 93 if (isa<ConstantAggregateZero>(Mask)) { 94 IC.eraseInstFromFunction(II); 95 return true; 96 } 97 98 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 99 // anything else at this level. 100 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 101 return false; 102 103 // The mask is constant or extended from a bool vector. Convert this x86 104 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 105 if (Value *BoolMask = getBoolVecFromMask(Mask)) { 106 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 107 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 108 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 109 110 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 111 112 // 'Replace uses' doesn't work for stores. Erase the original masked store. 113 IC.eraseInstFromFunction(II); 114 return true; 115 } 116 117 return false; 118 } 119 120 static Value *simplifyX86immShift(const IntrinsicInst &II, 121 InstCombiner::BuilderTy &Builder) { 122 bool LogicalShift = false; 123 bool ShiftLeft = false; 124 bool IsImm = false; 125 126 switch (II.getIntrinsicID()) { 127 default: 128 llvm_unreachable("Unexpected intrinsic!"); 129 case Intrinsic::x86_sse2_psrai_d: 130 case Intrinsic::x86_sse2_psrai_w: 131 case Intrinsic::x86_avx2_psrai_d: 132 case Intrinsic::x86_avx2_psrai_w: 133 case Intrinsic::x86_avx512_psrai_q_128: 134 case Intrinsic::x86_avx512_psrai_q_256: 135 case Intrinsic::x86_avx512_psrai_d_512: 136 case Intrinsic::x86_avx512_psrai_q_512: 137 case Intrinsic::x86_avx512_psrai_w_512: 138 IsImm = true; 139 [[fallthrough]]; 140 case Intrinsic::x86_sse2_psra_d: 141 case Intrinsic::x86_sse2_psra_w: 142 case Intrinsic::x86_avx2_psra_d: 143 case Intrinsic::x86_avx2_psra_w: 144 case Intrinsic::x86_avx512_psra_q_128: 145 case Intrinsic::x86_avx512_psra_q_256: 146 case Intrinsic::x86_avx512_psra_d_512: 147 case Intrinsic::x86_avx512_psra_q_512: 148 case Intrinsic::x86_avx512_psra_w_512: 149 LogicalShift = false; 150 ShiftLeft = false; 151 break; 152 case Intrinsic::x86_sse2_psrli_d: 153 case Intrinsic::x86_sse2_psrli_q: 154 case Intrinsic::x86_sse2_psrli_w: 155 case Intrinsic::x86_avx2_psrli_d: 156 case Intrinsic::x86_avx2_psrli_q: 157 case Intrinsic::x86_avx2_psrli_w: 158 case Intrinsic::x86_avx512_psrli_d_512: 159 case Intrinsic::x86_avx512_psrli_q_512: 160 case Intrinsic::x86_avx512_psrli_w_512: 161 IsImm = true; 162 [[fallthrough]]; 163 case Intrinsic::x86_sse2_psrl_d: 164 case Intrinsic::x86_sse2_psrl_q: 165 case Intrinsic::x86_sse2_psrl_w: 166 case Intrinsic::x86_avx2_psrl_d: 167 case Intrinsic::x86_avx2_psrl_q: 168 case Intrinsic::x86_avx2_psrl_w: 169 case Intrinsic::x86_avx512_psrl_d_512: 170 case Intrinsic::x86_avx512_psrl_q_512: 171 case Intrinsic::x86_avx512_psrl_w_512: 172 LogicalShift = true; 173 ShiftLeft = false; 174 break; 175 case Intrinsic::x86_sse2_pslli_d: 176 case Intrinsic::x86_sse2_pslli_q: 177 case Intrinsic::x86_sse2_pslli_w: 178 case Intrinsic::x86_avx2_pslli_d: 179 case Intrinsic::x86_avx2_pslli_q: 180 case Intrinsic::x86_avx2_pslli_w: 181 case Intrinsic::x86_avx512_pslli_d_512: 182 case Intrinsic::x86_avx512_pslli_q_512: 183 case Intrinsic::x86_avx512_pslli_w_512: 184 IsImm = true; 185 [[fallthrough]]; 186 case Intrinsic::x86_sse2_psll_d: 187 case Intrinsic::x86_sse2_psll_q: 188 case Intrinsic::x86_sse2_psll_w: 189 case Intrinsic::x86_avx2_psll_d: 190 case Intrinsic::x86_avx2_psll_q: 191 case Intrinsic::x86_avx2_psll_w: 192 case Intrinsic::x86_avx512_psll_d_512: 193 case Intrinsic::x86_avx512_psll_q_512: 194 case Intrinsic::x86_avx512_psll_w_512: 195 LogicalShift = true; 196 ShiftLeft = true; 197 break; 198 } 199 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 200 201 Value *Vec = II.getArgOperand(0); 202 Value *Amt = II.getArgOperand(1); 203 auto *VT = cast<FixedVectorType>(Vec->getType()); 204 Type *SVT = VT->getElementType(); 205 Type *AmtVT = Amt->getType(); 206 unsigned VWidth = VT->getNumElements(); 207 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 208 209 // If the shift amount is guaranteed to be in-range we can replace it with a 210 // generic shift. If its guaranteed to be out of range, logical shifts combine 211 // to zero and arithmetic shifts are clamped to (BitWidth - 1). 212 if (IsImm) { 213 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); 214 KnownBits KnownAmtBits = 215 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); 216 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 217 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 218 Amt = Builder.CreateVectorSplat(VWidth, Amt); 219 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 220 : Builder.CreateLShr(Vec, Amt)) 221 : Builder.CreateAShr(Vec, Amt)); 222 } 223 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 224 if (LogicalShift) 225 return ConstantAggregateZero::get(VT); 226 Amt = ConstantInt::get(SVT, BitWidth - 1); 227 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 228 } 229 } else { 230 // Ensure the first element has an in-range value and the rest of the 231 // elements in the bottom 64 bits are zero. 232 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 233 cast<VectorType>(AmtVT)->getElementType() == SVT && 234 "Unexpected shift-by-scalar type"); 235 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); 236 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 237 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 238 KnownBits KnownLowerBits = llvm::computeKnownBits( 239 Amt, DemandedLower, II.getModule()->getDataLayout()); 240 KnownBits KnownUpperBits = llvm::computeKnownBits( 241 Amt, DemandedUpper, II.getModule()->getDataLayout()); 242 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 243 (DemandedUpper.isZero() || KnownUpperBits.isZero())) { 244 SmallVector<int, 16> ZeroSplat(VWidth, 0); 245 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); 246 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 247 : Builder.CreateLShr(Vec, Amt)) 248 : Builder.CreateAShr(Vec, Amt)); 249 } 250 } 251 252 // Simplify if count is constant vector. 253 auto *CDV = dyn_cast<ConstantDataVector>(Amt); 254 if (!CDV) 255 return nullptr; 256 257 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 258 // operand to compute the shift amount. 259 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 260 cast<VectorType>(AmtVT)->getElementType() == SVT && 261 "Unexpected shift-by-scalar type"); 262 263 // Concatenate the sub-elements to create the 64-bit value. 264 APInt Count(64, 0); 265 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 266 unsigned SubEltIdx = (NumSubElts - 1) - i; 267 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 268 Count <<= BitWidth; 269 Count |= SubElt->getValue().zextOrTrunc(64); 270 } 271 272 // If shift-by-zero then just return the original value. 273 if (Count.isZero()) 274 return Vec; 275 276 // Handle cases when Shift >= BitWidth. 277 if (Count.uge(BitWidth)) { 278 // If LogicalShift - just return zero. 279 if (LogicalShift) 280 return ConstantAggregateZero::get(VT); 281 282 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 283 Count = APInt(64, BitWidth - 1); 284 } 285 286 // Get a constant vector of the same type as the first operand. 287 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 288 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 289 290 if (ShiftLeft) 291 return Builder.CreateShl(Vec, ShiftVec); 292 293 if (LogicalShift) 294 return Builder.CreateLShr(Vec, ShiftVec); 295 296 return Builder.CreateAShr(Vec, ShiftVec); 297 } 298 299 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 300 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 301 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 302 static Value *simplifyX86varShift(const IntrinsicInst &II, 303 InstCombiner::BuilderTy &Builder) { 304 bool LogicalShift = false; 305 bool ShiftLeft = false; 306 307 switch (II.getIntrinsicID()) { 308 default: 309 llvm_unreachable("Unexpected intrinsic!"); 310 case Intrinsic::x86_avx2_psrav_d: 311 case Intrinsic::x86_avx2_psrav_d_256: 312 case Intrinsic::x86_avx512_psrav_q_128: 313 case Intrinsic::x86_avx512_psrav_q_256: 314 case Intrinsic::x86_avx512_psrav_d_512: 315 case Intrinsic::x86_avx512_psrav_q_512: 316 case Intrinsic::x86_avx512_psrav_w_128: 317 case Intrinsic::x86_avx512_psrav_w_256: 318 case Intrinsic::x86_avx512_psrav_w_512: 319 LogicalShift = false; 320 ShiftLeft = false; 321 break; 322 case Intrinsic::x86_avx2_psrlv_d: 323 case Intrinsic::x86_avx2_psrlv_d_256: 324 case Intrinsic::x86_avx2_psrlv_q: 325 case Intrinsic::x86_avx2_psrlv_q_256: 326 case Intrinsic::x86_avx512_psrlv_d_512: 327 case Intrinsic::x86_avx512_psrlv_q_512: 328 case Intrinsic::x86_avx512_psrlv_w_128: 329 case Intrinsic::x86_avx512_psrlv_w_256: 330 case Intrinsic::x86_avx512_psrlv_w_512: 331 LogicalShift = true; 332 ShiftLeft = false; 333 break; 334 case Intrinsic::x86_avx2_psllv_d: 335 case Intrinsic::x86_avx2_psllv_d_256: 336 case Intrinsic::x86_avx2_psllv_q: 337 case Intrinsic::x86_avx2_psllv_q_256: 338 case Intrinsic::x86_avx512_psllv_d_512: 339 case Intrinsic::x86_avx512_psllv_q_512: 340 case Intrinsic::x86_avx512_psllv_w_128: 341 case Intrinsic::x86_avx512_psllv_w_256: 342 case Intrinsic::x86_avx512_psllv_w_512: 343 LogicalShift = true; 344 ShiftLeft = true; 345 break; 346 } 347 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 348 349 Value *Vec = II.getArgOperand(0); 350 Value *Amt = II.getArgOperand(1); 351 auto *VT = cast<FixedVectorType>(II.getType()); 352 Type *SVT = VT->getElementType(); 353 int NumElts = VT->getNumElements(); 354 int BitWidth = SVT->getIntegerBitWidth(); 355 356 // If the shift amount is guaranteed to be in-range we can replace it with a 357 // generic shift. 358 KnownBits KnownAmt = 359 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); 360 if (KnownAmt.getMaxValue().ult(BitWidth)) { 361 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 362 : Builder.CreateLShr(Vec, Amt)) 363 : Builder.CreateAShr(Vec, Amt)); 364 } 365 366 // Simplify if all shift amounts are constant/undef. 367 auto *CShift = dyn_cast<Constant>(Amt); 368 if (!CShift) 369 return nullptr; 370 371 // Collect each element's shift amount. 372 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 373 bool AnyOutOfRange = false; 374 SmallVector<int, 8> ShiftAmts; 375 for (int I = 0; I < NumElts; ++I) { 376 auto *CElt = CShift->getAggregateElement(I); 377 if (isa_and_nonnull<UndefValue>(CElt)) { 378 ShiftAmts.push_back(-1); 379 continue; 380 } 381 382 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 383 if (!COp) 384 return nullptr; 385 386 // Handle out of range shifts. 387 // If LogicalShift - set to BitWidth (special case). 388 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 389 APInt ShiftVal = COp->getValue(); 390 if (ShiftVal.uge(BitWidth)) { 391 AnyOutOfRange = LogicalShift; 392 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 393 continue; 394 } 395 396 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 397 } 398 399 // If all elements out of range or UNDEF, return vector of zeros/undefs. 400 // ArithmeticShift should only hit this if they are all UNDEF. 401 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 402 if (llvm::all_of(ShiftAmts, OutOfRange)) { 403 SmallVector<Constant *, 8> ConstantVec; 404 for (int Idx : ShiftAmts) { 405 if (Idx < 0) { 406 ConstantVec.push_back(UndefValue::get(SVT)); 407 } else { 408 assert(LogicalShift && "Logical shift expected"); 409 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 410 } 411 } 412 return ConstantVector::get(ConstantVec); 413 } 414 415 // We can't handle only some out of range values with generic logical shifts. 416 if (AnyOutOfRange) 417 return nullptr; 418 419 // Build the shift amount constant vector. 420 SmallVector<Constant *, 8> ShiftVecAmts; 421 for (int Idx : ShiftAmts) { 422 if (Idx < 0) 423 ShiftVecAmts.push_back(UndefValue::get(SVT)); 424 else 425 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 426 } 427 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 428 429 if (ShiftLeft) 430 return Builder.CreateShl(Vec, ShiftVec); 431 432 if (LogicalShift) 433 return Builder.CreateLShr(Vec, ShiftVec); 434 435 return Builder.CreateAShr(Vec, ShiftVec); 436 } 437 438 static Value *simplifyX86pack(IntrinsicInst &II, 439 InstCombiner::BuilderTy &Builder, bool IsSigned) { 440 Value *Arg0 = II.getArgOperand(0); 441 Value *Arg1 = II.getArgOperand(1); 442 Type *ResTy = II.getType(); 443 444 // Fast all undef handling. 445 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 446 return UndefValue::get(ResTy); 447 448 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 449 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 450 unsigned NumSrcElts = ArgTy->getNumElements(); 451 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && 452 "Unexpected packing types"); 453 454 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 455 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 456 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 457 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 458 "Unexpected packing types"); 459 460 // Constant folding. 461 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 462 return nullptr; 463 464 // Clamp Values - signed/unsigned both use signed clamp values, but they 465 // differ on the min/max values. 466 APInt MinValue, MaxValue; 467 if (IsSigned) { 468 // PACKSS: Truncate signed value with signed saturation. 469 // Source values less than dst minint are saturated to minint. 470 // Source values greater than dst maxint are saturated to maxint. 471 MinValue = 472 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 473 MaxValue = 474 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 475 } else { 476 // PACKUS: Truncate signed value with unsigned saturation. 477 // Source values less than zero are saturated to zero. 478 // Source values greater than dst maxuint are saturated to maxuint. 479 MinValue = APInt::getZero(SrcScalarSizeInBits); 480 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 481 } 482 483 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 484 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 485 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 486 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 487 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 488 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 489 490 // Shuffle clamped args together at the lane level. 491 SmallVector<int, 32> PackMask; 492 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 493 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 494 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 495 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 496 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 497 } 498 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 499 500 // Truncate to dst size. 501 return Builder.CreateTrunc(Shuffle, ResTy); 502 } 503 504 static Value *simplifyX86movmsk(const IntrinsicInst &II, 505 InstCombiner::BuilderTy &Builder) { 506 Value *Arg = II.getArgOperand(0); 507 Type *ResTy = II.getType(); 508 509 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 510 if (isa<UndefValue>(Arg)) 511 return Constant::getNullValue(ResTy); 512 513 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); 514 // We can't easily peek through x86_mmx types. 515 if (!ArgTy) 516 return nullptr; 517 518 // Expand MOVMSK to compare/bitcast/zext: 519 // e.g. PMOVMSKB(v16i8 x): 520 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 521 // %int = bitcast <16 x i1> %cmp to i16 522 // %res = zext i16 %int to i32 523 unsigned NumElts = ArgTy->getNumElements(); 524 Type *IntegerTy = Builder.getIntNTy(NumElts); 525 526 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy)); 527 Res = Builder.CreateIsNeg(Res); 528 Res = Builder.CreateBitCast(Res, IntegerTy); 529 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 530 return Res; 531 } 532 533 static Value *simplifyX86addcarry(const IntrinsicInst &II, 534 InstCombiner::BuilderTy &Builder) { 535 Value *CarryIn = II.getArgOperand(0); 536 Value *Op1 = II.getArgOperand(1); 537 Value *Op2 = II.getArgOperand(2); 538 Type *RetTy = II.getType(); 539 Type *OpTy = Op1->getType(); 540 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 541 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 542 "Unexpected types for x86 addcarry"); 543 544 // If carry-in is zero, this is just an unsigned add with overflow. 545 if (match(CarryIn, PatternMatch::m_ZeroInt())) { 546 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 547 {Op1, Op2}); 548 // The types have to be adjusted to match the x86 call types. 549 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); 550 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), 551 Builder.getInt8Ty()); 552 Value *Res = PoisonValue::get(RetTy); 553 Res = Builder.CreateInsertValue(Res, UAddOV, 0); 554 return Builder.CreateInsertValue(Res, UAddResult, 1); 555 } 556 557 return nullptr; 558 } 559 560 static Value *simplifyX86insertps(const IntrinsicInst &II, 561 InstCombiner::BuilderTy &Builder) { 562 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 563 if (!CInt) 564 return nullptr; 565 566 auto *VecTy = cast<FixedVectorType>(II.getType()); 567 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 568 569 // The immediate permute control byte looks like this: 570 // [3:0] - zero mask for each 32-bit lane 571 // [5:4] - select one 32-bit destination lane 572 // [7:6] - select one 32-bit source lane 573 574 uint8_t Imm = CInt->getZExtValue(); 575 uint8_t ZMask = Imm & 0xf; 576 uint8_t DestLane = (Imm >> 4) & 0x3; 577 uint8_t SourceLane = (Imm >> 6) & 0x3; 578 579 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 580 581 // If all zero mask bits are set, this was just a weird way to 582 // generate a zero vector. 583 if (ZMask == 0xf) 584 return ZeroVector; 585 586 // Initialize by passing all of the first source bits through. 587 int ShuffleMask[4] = {0, 1, 2, 3}; 588 589 // We may replace the second operand with the zero vector. 590 Value *V1 = II.getArgOperand(1); 591 592 if (ZMask) { 593 // If the zero mask is being used with a single input or the zero mask 594 // overrides the destination lane, this is a shuffle with the zero vector. 595 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 596 (ZMask & (1 << DestLane))) { 597 V1 = ZeroVector; 598 // We may still move 32-bits of the first source vector from one lane 599 // to another. 600 ShuffleMask[DestLane] = SourceLane; 601 // The zero mask may override the previous insert operation. 602 for (unsigned i = 0; i < 4; ++i) 603 if ((ZMask >> i) & 0x1) 604 ShuffleMask[i] = i + 4; 605 } else { 606 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 607 return nullptr; 608 } 609 } else { 610 // Replace the selected destination lane with the selected source lane. 611 ShuffleMask[DestLane] = SourceLane + 4; 612 } 613 614 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 615 } 616 617 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 618 /// or conversion to a shuffle vector. 619 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 620 ConstantInt *CILength, ConstantInt *CIIndex, 621 InstCombiner::BuilderTy &Builder) { 622 auto LowConstantHighUndef = [&](uint64_t Val) { 623 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 624 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 625 UndefValue::get(IntTy64)}; 626 return ConstantVector::get(Args); 627 }; 628 629 // See if we're dealing with constant values. 630 auto *C0 = dyn_cast<Constant>(Op0); 631 auto *CI0 = 632 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 633 : nullptr; 634 635 // Attempt to constant fold. 636 if (CILength && CIIndex) { 637 // From AMD documentation: "The bit index and field length are each six 638 // bits in length other bits of the field are ignored." 639 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 640 APInt APLength = CILength->getValue().zextOrTrunc(6); 641 642 unsigned Index = APIndex.getZExtValue(); 643 644 // From AMD documentation: "a value of zero in the field length is 645 // defined as length of 64". 646 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 647 648 // From AMD documentation: "If the sum of the bit index + length field 649 // is greater than 64, the results are undefined". 650 unsigned End = Index + Length; 651 652 // Note that both field index and field length are 8-bit quantities. 653 // Since variables 'Index' and 'Length' are unsigned values 654 // obtained from zero-extending field index and field length 655 // respectively, their sum should never wrap around. 656 if (End > 64) 657 return UndefValue::get(II.getType()); 658 659 // If we are inserting whole bytes, we can convert this to a shuffle. 660 // Lowering can recognize EXTRQI shuffle masks. 661 if ((Length % 8) == 0 && (Index % 8) == 0) { 662 // Convert bit indices to byte indices. 663 Length /= 8; 664 Index /= 8; 665 666 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 667 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 668 669 SmallVector<int, 16> ShuffleMask; 670 for (int i = 0; i != (int)Length; ++i) 671 ShuffleMask.push_back(i + Index); 672 for (int i = Length; i != 8; ++i) 673 ShuffleMask.push_back(i + 16); 674 for (int i = 8; i != 16; ++i) 675 ShuffleMask.push_back(-1); 676 677 Value *SV = Builder.CreateShuffleVector( 678 Builder.CreateBitCast(Op0, ShufTy), 679 ConstantAggregateZero::get(ShufTy), ShuffleMask); 680 return Builder.CreateBitCast(SV, II.getType()); 681 } 682 683 // Constant Fold - shift Index'th bit to lowest position and mask off 684 // Length bits. 685 if (CI0) { 686 APInt Elt = CI0->getValue(); 687 Elt.lshrInPlace(Index); 688 Elt = Elt.zextOrTrunc(Length); 689 return LowConstantHighUndef(Elt.getZExtValue()); 690 } 691 692 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 693 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 694 Value *Args[] = {Op0, CILength, CIIndex}; 695 Module *M = II.getModule(); 696 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 697 return Builder.CreateCall(F, Args); 698 } 699 } 700 701 // Constant Fold - extraction from zero is always {zero, undef}. 702 if (CI0 && CI0->isZero()) 703 return LowConstantHighUndef(0); 704 705 return nullptr; 706 } 707 708 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 709 /// folding or conversion to a shuffle vector. 710 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 711 APInt APLength, APInt APIndex, 712 InstCombiner::BuilderTy &Builder) { 713 // From AMD documentation: "The bit index and field length are each six bits 714 // in length other bits of the field are ignored." 715 APIndex = APIndex.zextOrTrunc(6); 716 APLength = APLength.zextOrTrunc(6); 717 718 // Attempt to constant fold. 719 unsigned Index = APIndex.getZExtValue(); 720 721 // From AMD documentation: "a value of zero in the field length is 722 // defined as length of 64". 723 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 724 725 // From AMD documentation: "If the sum of the bit index + length field 726 // is greater than 64, the results are undefined". 727 unsigned End = Index + Length; 728 729 // Note that both field index and field length are 8-bit quantities. 730 // Since variables 'Index' and 'Length' are unsigned values 731 // obtained from zero-extending field index and field length 732 // respectively, their sum should never wrap around. 733 if (End > 64) 734 return UndefValue::get(II.getType()); 735 736 // If we are inserting whole bytes, we can convert this to a shuffle. 737 // Lowering can recognize INSERTQI shuffle masks. 738 if ((Length % 8) == 0 && (Index % 8) == 0) { 739 // Convert bit indices to byte indices. 740 Length /= 8; 741 Index /= 8; 742 743 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 744 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 745 746 SmallVector<int, 16> ShuffleMask; 747 for (int i = 0; i != (int)Index; ++i) 748 ShuffleMask.push_back(i); 749 for (int i = 0; i != (int)Length; ++i) 750 ShuffleMask.push_back(i + 16); 751 for (int i = Index + Length; i != 8; ++i) 752 ShuffleMask.push_back(i); 753 for (int i = 8; i != 16; ++i) 754 ShuffleMask.push_back(-1); 755 756 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 757 Builder.CreateBitCast(Op1, ShufTy), 758 ShuffleMask); 759 return Builder.CreateBitCast(SV, II.getType()); 760 } 761 762 // See if we're dealing with constant values. 763 auto *C0 = dyn_cast<Constant>(Op0); 764 auto *C1 = dyn_cast<Constant>(Op1); 765 auto *CI00 = 766 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 767 : nullptr; 768 auto *CI10 = 769 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 770 : nullptr; 771 772 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 773 if (CI00 && CI10) { 774 APInt V00 = CI00->getValue(); 775 APInt V10 = CI10->getValue(); 776 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 777 V00 = V00 & ~Mask; 778 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 779 APInt Val = V00 | V10; 780 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 781 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 782 UndefValue::get(IntTy64)}; 783 return ConstantVector::get(Args); 784 } 785 786 // If we were an INSERTQ call, we'll save demanded elements if we convert to 787 // INSERTQI. 788 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 789 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 790 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 791 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 792 793 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 794 Module *M = II.getModule(); 795 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 796 return Builder.CreateCall(F, Args); 797 } 798 799 return nullptr; 800 } 801 802 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 803 static Value *simplifyX86pshufb(const IntrinsicInst &II, 804 InstCombiner::BuilderTy &Builder) { 805 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 806 if (!V) 807 return nullptr; 808 809 auto *VecTy = cast<FixedVectorType>(II.getType()); 810 unsigned NumElts = VecTy->getNumElements(); 811 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 812 "Unexpected number of elements in shuffle mask!"); 813 814 // Construct a shuffle mask from constant integers or UNDEFs. 815 int Indexes[64]; 816 817 // Each byte in the shuffle control mask forms an index to permute the 818 // corresponding byte in the destination operand. 819 for (unsigned I = 0; I < NumElts; ++I) { 820 Constant *COp = V->getAggregateElement(I); 821 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 822 return nullptr; 823 824 if (isa<UndefValue>(COp)) { 825 Indexes[I] = -1; 826 continue; 827 } 828 829 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 830 831 // If the most significant bit (bit[7]) of each byte of the shuffle 832 // control mask is set, then zero is written in the result byte. 833 // The zero vector is in the right-hand side of the resulting 834 // shufflevector. 835 836 // The value of each index for the high 128-bit lane is the least 837 // significant 4 bits of the respective shuffle control byte. 838 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); 839 Indexes[I] = Index; 840 } 841 842 auto V1 = II.getArgOperand(0); 843 auto V2 = Constant::getNullValue(VecTy); 844 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts)); 845 } 846 847 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 848 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 849 InstCombiner::BuilderTy &Builder) { 850 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 851 if (!V) 852 return nullptr; 853 854 auto *VecTy = cast<FixedVectorType>(II.getType()); 855 unsigned NumElts = VecTy->getNumElements(); 856 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 857 unsigned NumLaneElts = IsPD ? 2 : 4; 858 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 859 860 // Construct a shuffle mask from constant integers or UNDEFs. 861 int Indexes[16]; 862 863 // The intrinsics only read one or two bits, clear the rest. 864 for (unsigned I = 0; I < NumElts; ++I) { 865 Constant *COp = V->getAggregateElement(I); 866 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 867 return nullptr; 868 869 if (isa<UndefValue>(COp)) { 870 Indexes[I] = -1; 871 continue; 872 } 873 874 APInt Index = cast<ConstantInt>(COp)->getValue(); 875 Index = Index.zextOrTrunc(32).getLoBits(2); 876 877 // The PD variants uses bit 1 to select per-lane element index, so 878 // shift down to convert to generic shuffle mask index. 879 if (IsPD) 880 Index.lshrInPlace(1); 881 882 // The _256 variants are a bit trickier since the mask bits always index 883 // into the corresponding 128 half. In order to convert to a generic 884 // shuffle, we have to make that explicit. 885 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 886 887 Indexes[I] = Index.getZExtValue(); 888 } 889 890 auto V1 = II.getArgOperand(0); 891 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts)); 892 } 893 894 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 895 static Value *simplifyX86vpermv(const IntrinsicInst &II, 896 InstCombiner::BuilderTy &Builder) { 897 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 898 if (!V) 899 return nullptr; 900 901 auto *VecTy = cast<FixedVectorType>(II.getType()); 902 unsigned Size = VecTy->getNumElements(); 903 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 904 "Unexpected shuffle mask size"); 905 906 // Construct a shuffle mask from constant integers or UNDEFs. 907 int Indexes[64]; 908 909 for (unsigned I = 0; I < Size; ++I) { 910 Constant *COp = V->getAggregateElement(I); 911 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 912 return nullptr; 913 914 if (isa<UndefValue>(COp)) { 915 Indexes[I] = -1; 916 continue; 917 } 918 919 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 920 Index &= Size - 1; 921 Indexes[I] = Index; 922 } 923 924 auto V1 = II.getArgOperand(0); 925 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size)); 926 } 927 928 std::optional<Instruction *> 929 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 930 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 931 unsigned DemandedWidth) { 932 APInt UndefElts(Width, 0); 933 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 934 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 935 }; 936 937 Intrinsic::ID IID = II.getIntrinsicID(); 938 switch (IID) { 939 case Intrinsic::x86_bmi_bextr_32: 940 case Intrinsic::x86_bmi_bextr_64: 941 case Intrinsic::x86_tbm_bextri_u32: 942 case Intrinsic::x86_tbm_bextri_u64: 943 // If the RHS is a constant we can try some simplifications. 944 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 945 uint64_t Shift = C->getZExtValue(); 946 uint64_t Length = (Shift >> 8) & 0xff; 947 Shift &= 0xff; 948 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 949 // If the length is 0 or the shift is out of range, replace with zero. 950 if (Length == 0 || Shift >= BitWidth) { 951 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 952 } 953 // If the LHS is also a constant, we can completely constant fold this. 954 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 955 uint64_t Result = InC->getZExtValue() >> Shift; 956 if (Length > BitWidth) 957 Length = BitWidth; 958 Result &= maskTrailingOnes<uint64_t>(Length); 959 return IC.replaceInstUsesWith(II, 960 ConstantInt::get(II.getType(), Result)); 961 } 962 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 963 // are only masking bits that a shift already cleared? 964 } 965 break; 966 967 case Intrinsic::x86_bmi_bzhi_32: 968 case Intrinsic::x86_bmi_bzhi_64: 969 // If the RHS is a constant we can try some simplifications. 970 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 971 uint64_t Index = C->getZExtValue() & 0xff; 972 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 973 if (Index >= BitWidth) { 974 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 975 } 976 if (Index == 0) { 977 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 978 } 979 // If the LHS is also a constant, we can completely constant fold this. 980 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 981 uint64_t Result = InC->getZExtValue(); 982 Result &= maskTrailingOnes<uint64_t>(Index); 983 return IC.replaceInstUsesWith(II, 984 ConstantInt::get(II.getType(), Result)); 985 } 986 // TODO should we convert this to an AND if the RHS is constant? 987 } 988 break; 989 case Intrinsic::x86_bmi_pext_32: 990 case Intrinsic::x86_bmi_pext_64: 991 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 992 if (MaskC->isNullValue()) { 993 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 994 } 995 if (MaskC->isAllOnesValue()) { 996 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 997 } 998 999 unsigned MaskIdx, MaskLen; 1000 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { 1001 // any single contingous sequence of 1s anywhere in the mask simply 1002 // describes a subset of the input bits shifted to the appropriate 1003 // position. Replace with the straight forward IR. 1004 Value *Input = II.getArgOperand(0); 1005 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); 1006 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); 1007 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt); 1008 return IC.replaceInstUsesWith(II, Shifted); 1009 } 1010 1011 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 1012 uint64_t Src = SrcC->getZExtValue(); 1013 uint64_t Mask = MaskC->getZExtValue(); 1014 uint64_t Result = 0; 1015 uint64_t BitToSet = 1; 1016 1017 while (Mask) { 1018 // Isolate lowest set bit. 1019 uint64_t BitToTest = Mask & -Mask; 1020 if (BitToTest & Src) 1021 Result |= BitToSet; 1022 1023 BitToSet <<= 1; 1024 // Clear lowest set bit. 1025 Mask &= Mask - 1; 1026 } 1027 1028 return IC.replaceInstUsesWith(II, 1029 ConstantInt::get(II.getType(), Result)); 1030 } 1031 } 1032 break; 1033 case Intrinsic::x86_bmi_pdep_32: 1034 case Intrinsic::x86_bmi_pdep_64: 1035 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 1036 if (MaskC->isNullValue()) { 1037 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 1038 } 1039 if (MaskC->isAllOnesValue()) { 1040 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 1041 } 1042 1043 unsigned MaskIdx, MaskLen; 1044 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { 1045 // any single contingous sequence of 1s anywhere in the mask simply 1046 // describes a subset of the input bits shifted to the appropriate 1047 // position. Replace with the straight forward IR. 1048 Value *Input = II.getArgOperand(0); 1049 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); 1050 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); 1051 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); 1052 return IC.replaceInstUsesWith(II, Masked); 1053 } 1054 1055 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 1056 uint64_t Src = SrcC->getZExtValue(); 1057 uint64_t Mask = MaskC->getZExtValue(); 1058 uint64_t Result = 0; 1059 uint64_t BitToTest = 1; 1060 1061 while (Mask) { 1062 // Isolate lowest set bit. 1063 uint64_t BitToSet = Mask & -Mask; 1064 if (BitToTest & Src) 1065 Result |= BitToSet; 1066 1067 BitToTest <<= 1; 1068 // Clear lowest set bit; 1069 Mask &= Mask - 1; 1070 } 1071 1072 return IC.replaceInstUsesWith(II, 1073 ConstantInt::get(II.getType(), Result)); 1074 } 1075 } 1076 break; 1077 1078 case Intrinsic::x86_sse_cvtss2si: 1079 case Intrinsic::x86_sse_cvtss2si64: 1080 case Intrinsic::x86_sse_cvttss2si: 1081 case Intrinsic::x86_sse_cvttss2si64: 1082 case Intrinsic::x86_sse2_cvtsd2si: 1083 case Intrinsic::x86_sse2_cvtsd2si64: 1084 case Intrinsic::x86_sse2_cvttsd2si: 1085 case Intrinsic::x86_sse2_cvttsd2si64: 1086 case Intrinsic::x86_avx512_vcvtss2si32: 1087 case Intrinsic::x86_avx512_vcvtss2si64: 1088 case Intrinsic::x86_avx512_vcvtss2usi32: 1089 case Intrinsic::x86_avx512_vcvtss2usi64: 1090 case Intrinsic::x86_avx512_vcvtsd2si32: 1091 case Intrinsic::x86_avx512_vcvtsd2si64: 1092 case Intrinsic::x86_avx512_vcvtsd2usi32: 1093 case Intrinsic::x86_avx512_vcvtsd2usi64: 1094 case Intrinsic::x86_avx512_cvttss2si: 1095 case Intrinsic::x86_avx512_cvttss2si64: 1096 case Intrinsic::x86_avx512_cvttss2usi: 1097 case Intrinsic::x86_avx512_cvttss2usi64: 1098 case Intrinsic::x86_avx512_cvttsd2si: 1099 case Intrinsic::x86_avx512_cvttsd2si64: 1100 case Intrinsic::x86_avx512_cvttsd2usi: 1101 case Intrinsic::x86_avx512_cvttsd2usi64: { 1102 // These intrinsics only demand the 0th element of their input vectors. If 1103 // we can simplify the input based on that, do so now. 1104 Value *Arg = II.getArgOperand(0); 1105 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 1106 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 1107 return IC.replaceOperand(II, 0, V); 1108 } 1109 break; 1110 } 1111 1112 case Intrinsic::x86_mmx_pmovmskb: 1113 case Intrinsic::x86_sse_movmsk_ps: 1114 case Intrinsic::x86_sse2_movmsk_pd: 1115 case Intrinsic::x86_sse2_pmovmskb_128: 1116 case Intrinsic::x86_avx_movmsk_pd_256: 1117 case Intrinsic::x86_avx_movmsk_ps_256: 1118 case Intrinsic::x86_avx2_pmovmskb: 1119 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 1120 return IC.replaceInstUsesWith(II, V); 1121 } 1122 break; 1123 1124 case Intrinsic::x86_sse_comieq_ss: 1125 case Intrinsic::x86_sse_comige_ss: 1126 case Intrinsic::x86_sse_comigt_ss: 1127 case Intrinsic::x86_sse_comile_ss: 1128 case Intrinsic::x86_sse_comilt_ss: 1129 case Intrinsic::x86_sse_comineq_ss: 1130 case Intrinsic::x86_sse_ucomieq_ss: 1131 case Intrinsic::x86_sse_ucomige_ss: 1132 case Intrinsic::x86_sse_ucomigt_ss: 1133 case Intrinsic::x86_sse_ucomile_ss: 1134 case Intrinsic::x86_sse_ucomilt_ss: 1135 case Intrinsic::x86_sse_ucomineq_ss: 1136 case Intrinsic::x86_sse2_comieq_sd: 1137 case Intrinsic::x86_sse2_comige_sd: 1138 case Intrinsic::x86_sse2_comigt_sd: 1139 case Intrinsic::x86_sse2_comile_sd: 1140 case Intrinsic::x86_sse2_comilt_sd: 1141 case Intrinsic::x86_sse2_comineq_sd: 1142 case Intrinsic::x86_sse2_ucomieq_sd: 1143 case Intrinsic::x86_sse2_ucomige_sd: 1144 case Intrinsic::x86_sse2_ucomigt_sd: 1145 case Intrinsic::x86_sse2_ucomile_sd: 1146 case Intrinsic::x86_sse2_ucomilt_sd: 1147 case Intrinsic::x86_sse2_ucomineq_sd: 1148 case Intrinsic::x86_avx512_vcomi_ss: 1149 case Intrinsic::x86_avx512_vcomi_sd: 1150 case Intrinsic::x86_avx512_mask_cmp_ss: 1151 case Intrinsic::x86_avx512_mask_cmp_sd: { 1152 // These intrinsics only demand the 0th element of their input vectors. If 1153 // we can simplify the input based on that, do so now. 1154 bool MadeChange = false; 1155 Value *Arg0 = II.getArgOperand(0); 1156 Value *Arg1 = II.getArgOperand(1); 1157 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 1158 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 1159 IC.replaceOperand(II, 0, V); 1160 MadeChange = true; 1161 } 1162 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 1163 IC.replaceOperand(II, 1, V); 1164 MadeChange = true; 1165 } 1166 if (MadeChange) { 1167 return &II; 1168 } 1169 break; 1170 } 1171 1172 case Intrinsic::x86_avx512_add_ps_512: 1173 case Intrinsic::x86_avx512_div_ps_512: 1174 case Intrinsic::x86_avx512_mul_ps_512: 1175 case Intrinsic::x86_avx512_sub_ps_512: 1176 case Intrinsic::x86_avx512_add_pd_512: 1177 case Intrinsic::x86_avx512_div_pd_512: 1178 case Intrinsic::x86_avx512_mul_pd_512: 1179 case Intrinsic::x86_avx512_sub_pd_512: 1180 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 1181 // IR operations. 1182 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 1183 if (R->getValue() == 4) { 1184 Value *Arg0 = II.getArgOperand(0); 1185 Value *Arg1 = II.getArgOperand(1); 1186 1187 Value *V; 1188 switch (IID) { 1189 default: 1190 llvm_unreachable("Case stmts out of sync!"); 1191 case Intrinsic::x86_avx512_add_ps_512: 1192 case Intrinsic::x86_avx512_add_pd_512: 1193 V = IC.Builder.CreateFAdd(Arg0, Arg1); 1194 break; 1195 case Intrinsic::x86_avx512_sub_ps_512: 1196 case Intrinsic::x86_avx512_sub_pd_512: 1197 V = IC.Builder.CreateFSub(Arg0, Arg1); 1198 break; 1199 case Intrinsic::x86_avx512_mul_ps_512: 1200 case Intrinsic::x86_avx512_mul_pd_512: 1201 V = IC.Builder.CreateFMul(Arg0, Arg1); 1202 break; 1203 case Intrinsic::x86_avx512_div_ps_512: 1204 case Intrinsic::x86_avx512_div_pd_512: 1205 V = IC.Builder.CreateFDiv(Arg0, Arg1); 1206 break; 1207 } 1208 1209 return IC.replaceInstUsesWith(II, V); 1210 } 1211 } 1212 break; 1213 1214 case Intrinsic::x86_avx512_mask_add_ss_round: 1215 case Intrinsic::x86_avx512_mask_div_ss_round: 1216 case Intrinsic::x86_avx512_mask_mul_ss_round: 1217 case Intrinsic::x86_avx512_mask_sub_ss_round: 1218 case Intrinsic::x86_avx512_mask_add_sd_round: 1219 case Intrinsic::x86_avx512_mask_div_sd_round: 1220 case Intrinsic::x86_avx512_mask_mul_sd_round: 1221 case Intrinsic::x86_avx512_mask_sub_sd_round: 1222 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 1223 // IR operations. 1224 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 1225 if (R->getValue() == 4) { 1226 // Extract the element as scalars. 1227 Value *Arg0 = II.getArgOperand(0); 1228 Value *Arg1 = II.getArgOperand(1); 1229 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 1230 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 1231 1232 Value *V; 1233 switch (IID) { 1234 default: 1235 llvm_unreachable("Case stmts out of sync!"); 1236 case Intrinsic::x86_avx512_mask_add_ss_round: 1237 case Intrinsic::x86_avx512_mask_add_sd_round: 1238 V = IC.Builder.CreateFAdd(LHS, RHS); 1239 break; 1240 case Intrinsic::x86_avx512_mask_sub_ss_round: 1241 case Intrinsic::x86_avx512_mask_sub_sd_round: 1242 V = IC.Builder.CreateFSub(LHS, RHS); 1243 break; 1244 case Intrinsic::x86_avx512_mask_mul_ss_round: 1245 case Intrinsic::x86_avx512_mask_mul_sd_round: 1246 V = IC.Builder.CreateFMul(LHS, RHS); 1247 break; 1248 case Intrinsic::x86_avx512_mask_div_ss_round: 1249 case Intrinsic::x86_avx512_mask_div_sd_round: 1250 V = IC.Builder.CreateFDiv(LHS, RHS); 1251 break; 1252 } 1253 1254 // Handle the masking aspect of the intrinsic. 1255 Value *Mask = II.getArgOperand(3); 1256 auto *C = dyn_cast<ConstantInt>(Mask); 1257 // We don't need a select if we know the mask bit is a 1. 1258 if (!C || !C->getValue()[0]) { 1259 // Cast the mask to an i1 vector and then extract the lowest element. 1260 auto *MaskTy = FixedVectorType::get( 1261 IC.Builder.getInt1Ty(), 1262 cast<IntegerType>(Mask->getType())->getBitWidth()); 1263 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 1264 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 1265 // Extract the lowest element from the passthru operand. 1266 Value *Passthru = 1267 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 1268 V = IC.Builder.CreateSelect(Mask, V, Passthru); 1269 } 1270 1271 // Insert the result back into the original argument 0. 1272 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 1273 1274 return IC.replaceInstUsesWith(II, V); 1275 } 1276 } 1277 break; 1278 1279 // Constant fold ashr( <A x Bi>, Ci ). 1280 // Constant fold lshr( <A x Bi>, Ci ). 1281 // Constant fold shl( <A x Bi>, Ci ). 1282 case Intrinsic::x86_sse2_psrai_d: 1283 case Intrinsic::x86_sse2_psrai_w: 1284 case Intrinsic::x86_avx2_psrai_d: 1285 case Intrinsic::x86_avx2_psrai_w: 1286 case Intrinsic::x86_avx512_psrai_q_128: 1287 case Intrinsic::x86_avx512_psrai_q_256: 1288 case Intrinsic::x86_avx512_psrai_d_512: 1289 case Intrinsic::x86_avx512_psrai_q_512: 1290 case Intrinsic::x86_avx512_psrai_w_512: 1291 case Intrinsic::x86_sse2_psrli_d: 1292 case Intrinsic::x86_sse2_psrli_q: 1293 case Intrinsic::x86_sse2_psrli_w: 1294 case Intrinsic::x86_avx2_psrli_d: 1295 case Intrinsic::x86_avx2_psrli_q: 1296 case Intrinsic::x86_avx2_psrli_w: 1297 case Intrinsic::x86_avx512_psrli_d_512: 1298 case Intrinsic::x86_avx512_psrli_q_512: 1299 case Intrinsic::x86_avx512_psrli_w_512: 1300 case Intrinsic::x86_sse2_pslli_d: 1301 case Intrinsic::x86_sse2_pslli_q: 1302 case Intrinsic::x86_sse2_pslli_w: 1303 case Intrinsic::x86_avx2_pslli_d: 1304 case Intrinsic::x86_avx2_pslli_q: 1305 case Intrinsic::x86_avx2_pslli_w: 1306 case Intrinsic::x86_avx512_pslli_d_512: 1307 case Intrinsic::x86_avx512_pslli_q_512: 1308 case Intrinsic::x86_avx512_pslli_w_512: 1309 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 1310 return IC.replaceInstUsesWith(II, V); 1311 } 1312 break; 1313 1314 case Intrinsic::x86_sse2_psra_d: 1315 case Intrinsic::x86_sse2_psra_w: 1316 case Intrinsic::x86_avx2_psra_d: 1317 case Intrinsic::x86_avx2_psra_w: 1318 case Intrinsic::x86_avx512_psra_q_128: 1319 case Intrinsic::x86_avx512_psra_q_256: 1320 case Intrinsic::x86_avx512_psra_d_512: 1321 case Intrinsic::x86_avx512_psra_q_512: 1322 case Intrinsic::x86_avx512_psra_w_512: 1323 case Intrinsic::x86_sse2_psrl_d: 1324 case Intrinsic::x86_sse2_psrl_q: 1325 case Intrinsic::x86_sse2_psrl_w: 1326 case Intrinsic::x86_avx2_psrl_d: 1327 case Intrinsic::x86_avx2_psrl_q: 1328 case Intrinsic::x86_avx2_psrl_w: 1329 case Intrinsic::x86_avx512_psrl_d_512: 1330 case Intrinsic::x86_avx512_psrl_q_512: 1331 case Intrinsic::x86_avx512_psrl_w_512: 1332 case Intrinsic::x86_sse2_psll_d: 1333 case Intrinsic::x86_sse2_psll_q: 1334 case Intrinsic::x86_sse2_psll_w: 1335 case Intrinsic::x86_avx2_psll_d: 1336 case Intrinsic::x86_avx2_psll_q: 1337 case Intrinsic::x86_avx2_psll_w: 1338 case Intrinsic::x86_avx512_psll_d_512: 1339 case Intrinsic::x86_avx512_psll_q_512: 1340 case Intrinsic::x86_avx512_psll_w_512: { 1341 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 1342 return IC.replaceInstUsesWith(II, V); 1343 } 1344 1345 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 1346 // operand to compute the shift amount. 1347 Value *Arg1 = II.getArgOperand(1); 1348 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 1349 "Unexpected packed shift size"); 1350 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 1351 1352 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 1353 return IC.replaceOperand(II, 1, V); 1354 } 1355 break; 1356 } 1357 1358 case Intrinsic::x86_avx2_psllv_d: 1359 case Intrinsic::x86_avx2_psllv_d_256: 1360 case Intrinsic::x86_avx2_psllv_q: 1361 case Intrinsic::x86_avx2_psllv_q_256: 1362 case Intrinsic::x86_avx512_psllv_d_512: 1363 case Intrinsic::x86_avx512_psllv_q_512: 1364 case Intrinsic::x86_avx512_psllv_w_128: 1365 case Intrinsic::x86_avx512_psllv_w_256: 1366 case Intrinsic::x86_avx512_psllv_w_512: 1367 case Intrinsic::x86_avx2_psrav_d: 1368 case Intrinsic::x86_avx2_psrav_d_256: 1369 case Intrinsic::x86_avx512_psrav_q_128: 1370 case Intrinsic::x86_avx512_psrav_q_256: 1371 case Intrinsic::x86_avx512_psrav_d_512: 1372 case Intrinsic::x86_avx512_psrav_q_512: 1373 case Intrinsic::x86_avx512_psrav_w_128: 1374 case Intrinsic::x86_avx512_psrav_w_256: 1375 case Intrinsic::x86_avx512_psrav_w_512: 1376 case Intrinsic::x86_avx2_psrlv_d: 1377 case Intrinsic::x86_avx2_psrlv_d_256: 1378 case Intrinsic::x86_avx2_psrlv_q: 1379 case Intrinsic::x86_avx2_psrlv_q_256: 1380 case Intrinsic::x86_avx512_psrlv_d_512: 1381 case Intrinsic::x86_avx512_psrlv_q_512: 1382 case Intrinsic::x86_avx512_psrlv_w_128: 1383 case Intrinsic::x86_avx512_psrlv_w_256: 1384 case Intrinsic::x86_avx512_psrlv_w_512: 1385 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 1386 return IC.replaceInstUsesWith(II, V); 1387 } 1388 break; 1389 1390 case Intrinsic::x86_sse2_packssdw_128: 1391 case Intrinsic::x86_sse2_packsswb_128: 1392 case Intrinsic::x86_avx2_packssdw: 1393 case Intrinsic::x86_avx2_packsswb: 1394 case Intrinsic::x86_avx512_packssdw_512: 1395 case Intrinsic::x86_avx512_packsswb_512: 1396 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 1397 return IC.replaceInstUsesWith(II, V); 1398 } 1399 break; 1400 1401 case Intrinsic::x86_sse2_packuswb_128: 1402 case Intrinsic::x86_sse41_packusdw: 1403 case Intrinsic::x86_avx2_packusdw: 1404 case Intrinsic::x86_avx2_packuswb: 1405 case Intrinsic::x86_avx512_packusdw_512: 1406 case Intrinsic::x86_avx512_packuswb_512: 1407 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 1408 return IC.replaceInstUsesWith(II, V); 1409 } 1410 break; 1411 1412 case Intrinsic::x86_pclmulqdq: 1413 case Intrinsic::x86_pclmulqdq_256: 1414 case Intrinsic::x86_pclmulqdq_512: { 1415 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 1416 unsigned Imm = C->getZExtValue(); 1417 1418 bool MadeChange = false; 1419 Value *Arg0 = II.getArgOperand(0); 1420 Value *Arg1 = II.getArgOperand(1); 1421 unsigned VWidth = 1422 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 1423 1424 APInt UndefElts1(VWidth, 0); 1425 APInt DemandedElts1 = 1426 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); 1427 if (Value *V = 1428 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 1429 IC.replaceOperand(II, 0, V); 1430 MadeChange = true; 1431 } 1432 1433 APInt UndefElts2(VWidth, 0); 1434 APInt DemandedElts2 = 1435 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 1436 if (Value *V = 1437 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 1438 IC.replaceOperand(II, 1, V); 1439 MadeChange = true; 1440 } 1441 1442 // If either input elements are undef, the result is zero. 1443 if (DemandedElts1.isSubsetOf(UndefElts1) || 1444 DemandedElts2.isSubsetOf(UndefElts2)) { 1445 return IC.replaceInstUsesWith(II, 1446 ConstantAggregateZero::get(II.getType())); 1447 } 1448 1449 if (MadeChange) { 1450 return &II; 1451 } 1452 } 1453 break; 1454 } 1455 1456 case Intrinsic::x86_sse41_insertps: 1457 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 1458 return IC.replaceInstUsesWith(II, V); 1459 } 1460 break; 1461 1462 case Intrinsic::x86_sse4a_extrq: { 1463 Value *Op0 = II.getArgOperand(0); 1464 Value *Op1 = II.getArgOperand(1); 1465 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1466 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 1467 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 1468 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 1469 VWidth1 == 16 && "Unexpected operand sizes"); 1470 1471 // See if we're dealing with constant values. 1472 auto *C1 = dyn_cast<Constant>(Op1); 1473 auto *CILength = 1474 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 1475 : nullptr; 1476 auto *CIIndex = 1477 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 1478 : nullptr; 1479 1480 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 1481 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 1482 return IC.replaceInstUsesWith(II, V); 1483 } 1484 1485 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 1486 // operands and the lowest 16-bits of the second. 1487 bool MadeChange = false; 1488 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 1489 IC.replaceOperand(II, 0, V); 1490 MadeChange = true; 1491 } 1492 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 1493 IC.replaceOperand(II, 1, V); 1494 MadeChange = true; 1495 } 1496 if (MadeChange) { 1497 return &II; 1498 } 1499 break; 1500 } 1501 1502 case Intrinsic::x86_sse4a_extrqi: { 1503 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 1504 // bits of the lower 64-bits. The upper 64-bits are undefined. 1505 Value *Op0 = II.getArgOperand(0); 1506 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1507 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 1508 "Unexpected operand size"); 1509 1510 // See if we're dealing with constant values. 1511 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 1512 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1513 1514 // Attempt to simplify to a constant or shuffle vector. 1515 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 1516 return IC.replaceInstUsesWith(II, V); 1517 } 1518 1519 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 1520 // operand. 1521 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 1522 return IC.replaceOperand(II, 0, V); 1523 } 1524 break; 1525 } 1526 1527 case Intrinsic::x86_sse4a_insertq: { 1528 Value *Op0 = II.getArgOperand(0); 1529 Value *Op1 = II.getArgOperand(1); 1530 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1531 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 1532 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 1533 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 1534 "Unexpected operand size"); 1535 1536 // See if we're dealing with constant values. 1537 auto *C1 = dyn_cast<Constant>(Op1); 1538 auto *CI11 = 1539 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 1540 : nullptr; 1541 1542 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 1543 if (CI11) { 1544 const APInt &V11 = CI11->getValue(); 1545 APInt Len = V11.zextOrTrunc(6); 1546 APInt Idx = V11.lshr(8).zextOrTrunc(6); 1547 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 1548 return IC.replaceInstUsesWith(II, V); 1549 } 1550 } 1551 1552 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 1553 // operand. 1554 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 1555 return IC.replaceOperand(II, 0, V); 1556 } 1557 break; 1558 } 1559 1560 case Intrinsic::x86_sse4a_insertqi: { 1561 // INSERTQI: Extract lowest Length bits from lower half of second source and 1562 // insert over first source starting at Index bit. The upper 64-bits are 1563 // undefined. 1564 Value *Op0 = II.getArgOperand(0); 1565 Value *Op1 = II.getArgOperand(1); 1566 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1567 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 1568 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 1569 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 1570 VWidth1 == 2 && "Unexpected operand sizes"); 1571 1572 // See if we're dealing with constant values. 1573 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1574 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 1575 1576 // Attempt to simplify to a constant or shuffle vector. 1577 if (CILength && CIIndex) { 1578 APInt Len = CILength->getValue().zextOrTrunc(6); 1579 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 1580 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 1581 return IC.replaceInstUsesWith(II, V); 1582 } 1583 } 1584 1585 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 1586 // operands. 1587 bool MadeChange = false; 1588 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 1589 IC.replaceOperand(II, 0, V); 1590 MadeChange = true; 1591 } 1592 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 1593 IC.replaceOperand(II, 1, V); 1594 MadeChange = true; 1595 } 1596 if (MadeChange) { 1597 return &II; 1598 } 1599 break; 1600 } 1601 1602 case Intrinsic::x86_sse41_pblendvb: 1603 case Intrinsic::x86_sse41_blendvps: 1604 case Intrinsic::x86_sse41_blendvpd: 1605 case Intrinsic::x86_avx_blendv_ps_256: 1606 case Intrinsic::x86_avx_blendv_pd_256: 1607 case Intrinsic::x86_avx2_pblendvb: { 1608 // fold (blend A, A, Mask) -> A 1609 Value *Op0 = II.getArgOperand(0); 1610 Value *Op1 = II.getArgOperand(1); 1611 Value *Mask = II.getArgOperand(2); 1612 if (Op0 == Op1) { 1613 return IC.replaceInstUsesWith(II, Op0); 1614 } 1615 1616 // Zero Mask - select 1st argument. 1617 if (isa<ConstantAggregateZero>(Mask)) { 1618 return IC.replaceInstUsesWith(II, Op0); 1619 } 1620 1621 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 1622 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 1623 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 1624 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 1625 } 1626 1627 // Convert to a vector select if we can bypass casts and find a boolean 1628 // vector condition value. 1629 Value *BoolVec; 1630 Mask = InstCombiner::peekThroughBitcast(Mask); 1631 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && 1632 BoolVec->getType()->isVectorTy() && 1633 BoolVec->getType()->getScalarSizeInBits() == 1) { 1634 assert(Mask->getType()->getPrimitiveSizeInBits() == 1635 II.getType()->getPrimitiveSizeInBits() && 1636 "Not expecting mask and operands with different sizes"); 1637 1638 unsigned NumMaskElts = 1639 cast<FixedVectorType>(Mask->getType())->getNumElements(); 1640 unsigned NumOperandElts = 1641 cast<FixedVectorType>(II.getType())->getNumElements(); 1642 if (NumMaskElts == NumOperandElts) { 1643 return SelectInst::Create(BoolVec, Op1, Op0); 1644 } 1645 1646 // If the mask has less elements than the operands, each mask bit maps to 1647 // multiple elements of the operands. Bitcast back and forth. 1648 if (NumMaskElts < NumOperandElts) { 1649 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); 1650 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); 1651 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 1652 return new BitCastInst(Sel, II.getType()); 1653 } 1654 } 1655 1656 break; 1657 } 1658 1659 case Intrinsic::x86_ssse3_pshuf_b_128: 1660 case Intrinsic::x86_avx2_pshuf_b: 1661 case Intrinsic::x86_avx512_pshuf_b_512: 1662 if (Value *V = simplifyX86pshufb(II, IC.Builder)) { 1663 return IC.replaceInstUsesWith(II, V); 1664 } 1665 break; 1666 1667 case Intrinsic::x86_avx_vpermilvar_ps: 1668 case Intrinsic::x86_avx_vpermilvar_ps_256: 1669 case Intrinsic::x86_avx512_vpermilvar_ps_512: 1670 case Intrinsic::x86_avx_vpermilvar_pd: 1671 case Intrinsic::x86_avx_vpermilvar_pd_256: 1672 case Intrinsic::x86_avx512_vpermilvar_pd_512: 1673 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 1674 return IC.replaceInstUsesWith(II, V); 1675 } 1676 break; 1677 1678 case Intrinsic::x86_avx2_permd: 1679 case Intrinsic::x86_avx2_permps: 1680 case Intrinsic::x86_avx512_permvar_df_256: 1681 case Intrinsic::x86_avx512_permvar_df_512: 1682 case Intrinsic::x86_avx512_permvar_di_256: 1683 case Intrinsic::x86_avx512_permvar_di_512: 1684 case Intrinsic::x86_avx512_permvar_hi_128: 1685 case Intrinsic::x86_avx512_permvar_hi_256: 1686 case Intrinsic::x86_avx512_permvar_hi_512: 1687 case Intrinsic::x86_avx512_permvar_qi_128: 1688 case Intrinsic::x86_avx512_permvar_qi_256: 1689 case Intrinsic::x86_avx512_permvar_qi_512: 1690 case Intrinsic::x86_avx512_permvar_sf_512: 1691 case Intrinsic::x86_avx512_permvar_si_512: 1692 if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 1693 return IC.replaceInstUsesWith(II, V); 1694 } 1695 break; 1696 1697 case Intrinsic::x86_avx_maskload_ps: 1698 case Intrinsic::x86_avx_maskload_pd: 1699 case Intrinsic::x86_avx_maskload_ps_256: 1700 case Intrinsic::x86_avx_maskload_pd_256: 1701 case Intrinsic::x86_avx2_maskload_d: 1702 case Intrinsic::x86_avx2_maskload_q: 1703 case Intrinsic::x86_avx2_maskload_d_256: 1704 case Intrinsic::x86_avx2_maskload_q_256: 1705 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { 1706 return I; 1707 } 1708 break; 1709 1710 case Intrinsic::x86_sse2_maskmov_dqu: 1711 case Intrinsic::x86_avx_maskstore_ps: 1712 case Intrinsic::x86_avx_maskstore_pd: 1713 case Intrinsic::x86_avx_maskstore_ps_256: 1714 case Intrinsic::x86_avx_maskstore_pd_256: 1715 case Intrinsic::x86_avx2_maskstore_d: 1716 case Intrinsic::x86_avx2_maskstore_q: 1717 case Intrinsic::x86_avx2_maskstore_d_256: 1718 case Intrinsic::x86_avx2_maskstore_q_256: 1719 if (simplifyX86MaskedStore(II, IC)) { 1720 return nullptr; 1721 } 1722 break; 1723 1724 case Intrinsic::x86_addcarry_32: 1725 case Intrinsic::x86_addcarry_64: 1726 if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 1727 return IC.replaceInstUsesWith(II, V); 1728 } 1729 break; 1730 1731 default: 1732 break; 1733 } 1734 return std::nullopt; 1735 } 1736 1737 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 1738 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 1739 bool &KnownBitsComputed) const { 1740 switch (II.getIntrinsicID()) { 1741 default: 1742 break; 1743 case Intrinsic::x86_mmx_pmovmskb: 1744 case Intrinsic::x86_sse_movmsk_ps: 1745 case Intrinsic::x86_sse2_movmsk_pd: 1746 case Intrinsic::x86_sse2_pmovmskb_128: 1747 case Intrinsic::x86_avx_movmsk_ps_256: 1748 case Intrinsic::x86_avx_movmsk_pd_256: 1749 case Intrinsic::x86_avx2_pmovmskb: { 1750 // MOVMSK copies the vector elements' sign bits to the low bits 1751 // and zeros the high bits. 1752 unsigned ArgWidth; 1753 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { 1754 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. 1755 } else { 1756 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType()); 1757 ArgWidth = ArgType->getNumElements(); 1758 } 1759 1760 // If we don't need any of low bits then return zero, 1761 // we know that DemandedMask is non-zero already. 1762 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); 1763 Type *VTy = II.getType(); 1764 if (DemandedElts.isZero()) { 1765 return ConstantInt::getNullValue(VTy); 1766 } 1767 1768 // We know that the upper bits are set to zero. 1769 Known.Zero.setBitsFrom(ArgWidth); 1770 KnownBitsComputed = true; 1771 break; 1772 } 1773 } 1774 return std::nullopt; 1775 } 1776 1777 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( 1778 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 1779 APInt &UndefElts2, APInt &UndefElts3, 1780 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1781 simplifyAndSetOp) const { 1782 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); 1783 switch (II.getIntrinsicID()) { 1784 default: 1785 break; 1786 case Intrinsic::x86_xop_vfrcz_ss: 1787 case Intrinsic::x86_xop_vfrcz_sd: 1788 // The instructions for these intrinsics are speced to zero upper bits not 1789 // pass them through like other scalar intrinsics. So we shouldn't just 1790 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. 1791 // Instead we should return a zero vector. 1792 if (!DemandedElts[0]) { 1793 IC.addToWorklist(&II); 1794 return ConstantAggregateZero::get(II.getType()); 1795 } 1796 1797 // Only the lower element is used. 1798 DemandedElts = 1; 1799 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1800 1801 // Only the lower element is undefined. The high elements are zero. 1802 UndefElts = UndefElts[0]; 1803 break; 1804 1805 // Unary scalar-as-vector operations that work column-wise. 1806 case Intrinsic::x86_sse_rcp_ss: 1807 case Intrinsic::x86_sse_rsqrt_ss: 1808 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1809 1810 // If lowest element of a scalar op isn't used then use Arg0. 1811 if (!DemandedElts[0]) { 1812 IC.addToWorklist(&II); 1813 return II.getArgOperand(0); 1814 } 1815 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 1816 // checks). 1817 break; 1818 1819 // Binary scalar-as-vector operations that work column-wise. The high 1820 // elements come from operand 0. The low element is a function of both 1821 // operands. 1822 case Intrinsic::x86_sse_min_ss: 1823 case Intrinsic::x86_sse_max_ss: 1824 case Intrinsic::x86_sse_cmp_ss: 1825 case Intrinsic::x86_sse2_min_sd: 1826 case Intrinsic::x86_sse2_max_sd: 1827 case Intrinsic::x86_sse2_cmp_sd: { 1828 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1829 1830 // If lowest element of a scalar op isn't used then use Arg0. 1831 if (!DemandedElts[0]) { 1832 IC.addToWorklist(&II); 1833 return II.getArgOperand(0); 1834 } 1835 1836 // Only lower element is used for operand 1. 1837 DemandedElts = 1; 1838 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1839 1840 // Lower element is undefined if both lower elements are undefined. 1841 // Consider things like undef&0. The result is known zero, not undef. 1842 if (!UndefElts2[0]) 1843 UndefElts.clearBit(0); 1844 1845 break; 1846 } 1847 1848 // Binary scalar-as-vector operations that work column-wise. The high 1849 // elements come from operand 0 and the low element comes from operand 1. 1850 case Intrinsic::x86_sse41_round_ss: 1851 case Intrinsic::x86_sse41_round_sd: { 1852 // Don't use the low element of operand 0. 1853 APInt DemandedElts2 = DemandedElts; 1854 DemandedElts2.clearBit(0); 1855 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 1856 1857 // If lowest element of a scalar op isn't used then use Arg0. 1858 if (!DemandedElts[0]) { 1859 IC.addToWorklist(&II); 1860 return II.getArgOperand(0); 1861 } 1862 1863 // Only lower element is used for operand 1. 1864 DemandedElts = 1; 1865 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1866 1867 // Take the high undef elements from operand 0 and take the lower element 1868 // from operand 1. 1869 UndefElts.clearBit(0); 1870 UndefElts |= UndefElts2[0]; 1871 break; 1872 } 1873 1874 // Three input scalar-as-vector operations that work column-wise. The high 1875 // elements come from operand 0 and the low element is a function of all 1876 // three inputs. 1877 case Intrinsic::x86_avx512_mask_add_ss_round: 1878 case Intrinsic::x86_avx512_mask_div_ss_round: 1879 case Intrinsic::x86_avx512_mask_mul_ss_round: 1880 case Intrinsic::x86_avx512_mask_sub_ss_round: 1881 case Intrinsic::x86_avx512_mask_max_ss_round: 1882 case Intrinsic::x86_avx512_mask_min_ss_round: 1883 case Intrinsic::x86_avx512_mask_add_sd_round: 1884 case Intrinsic::x86_avx512_mask_div_sd_round: 1885 case Intrinsic::x86_avx512_mask_mul_sd_round: 1886 case Intrinsic::x86_avx512_mask_sub_sd_round: 1887 case Intrinsic::x86_avx512_mask_max_sd_round: 1888 case Intrinsic::x86_avx512_mask_min_sd_round: 1889 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1890 1891 // If lowest element of a scalar op isn't used then use Arg0. 1892 if (!DemandedElts[0]) { 1893 IC.addToWorklist(&II); 1894 return II.getArgOperand(0); 1895 } 1896 1897 // Only lower element is used for operand 1 and 2. 1898 DemandedElts = 1; 1899 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1900 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 1901 1902 // Lower element is undefined if all three lower elements are undefined. 1903 // Consider things like undef&0. The result is known zero, not undef. 1904 if (!UndefElts2[0] || !UndefElts3[0]) 1905 UndefElts.clearBit(0); 1906 break; 1907 1908 // TODO: Add fmaddsub support? 1909 case Intrinsic::x86_sse3_addsub_pd: 1910 case Intrinsic::x86_sse3_addsub_ps: 1911 case Intrinsic::x86_avx_addsub_pd_256: 1912 case Intrinsic::x86_avx_addsub_ps_256: { 1913 // If none of the even or none of the odd lanes are required, turn this 1914 // into a generic FP math instruction. 1915 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 1916 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 1917 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 1918 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 1919 if (IsSubOnly || IsAddOnly) { 1920 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 1921 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 1922 IC.Builder.SetInsertPoint(&II); 1923 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 1924 return IC.Builder.CreateBinOp( 1925 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 1926 } 1927 1928 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1929 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1930 UndefElts &= UndefElts2; 1931 break; 1932 } 1933 1934 // General per-element vector operations. 1935 case Intrinsic::x86_avx2_psllv_d: 1936 case Intrinsic::x86_avx2_psllv_d_256: 1937 case Intrinsic::x86_avx2_psllv_q: 1938 case Intrinsic::x86_avx2_psllv_q_256: 1939 case Intrinsic::x86_avx2_psrlv_d: 1940 case Intrinsic::x86_avx2_psrlv_d_256: 1941 case Intrinsic::x86_avx2_psrlv_q: 1942 case Intrinsic::x86_avx2_psrlv_q_256: 1943 case Intrinsic::x86_avx2_psrav_d: 1944 case Intrinsic::x86_avx2_psrav_d_256: { 1945 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1946 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1947 UndefElts &= UndefElts2; 1948 break; 1949 } 1950 1951 case Intrinsic::x86_sse2_packssdw_128: 1952 case Intrinsic::x86_sse2_packsswb_128: 1953 case Intrinsic::x86_sse2_packuswb_128: 1954 case Intrinsic::x86_sse41_packusdw: 1955 case Intrinsic::x86_avx2_packssdw: 1956 case Intrinsic::x86_avx2_packsswb: 1957 case Intrinsic::x86_avx2_packusdw: 1958 case Intrinsic::x86_avx2_packuswb: 1959 case Intrinsic::x86_avx512_packssdw_512: 1960 case Intrinsic::x86_avx512_packsswb_512: 1961 case Intrinsic::x86_avx512_packusdw_512: 1962 case Intrinsic::x86_avx512_packuswb_512: { 1963 auto *Ty0 = II.getArgOperand(0)->getType(); 1964 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 1965 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 1966 1967 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 1968 unsigned VWidthPerLane = VWidth / NumLanes; 1969 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 1970 1971 // Per lane, pack the elements of the first input and then the second. 1972 // e.g. 1973 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 1974 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 1975 for (int OpNum = 0; OpNum != 2; ++OpNum) { 1976 APInt OpDemandedElts(InnerVWidth, 0); 1977 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 1978 unsigned LaneIdx = Lane * VWidthPerLane; 1979 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 1980 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 1981 if (DemandedElts[Idx]) 1982 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 1983 } 1984 } 1985 1986 // Demand elements from the operand. 1987 APInt OpUndefElts(InnerVWidth, 0); 1988 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 1989 1990 // Pack the operand's UNDEF elements, one lane at a time. 1991 OpUndefElts = OpUndefElts.zext(VWidth); 1992 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 1993 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 1994 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 1995 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 1996 UndefElts |= LaneElts; 1997 } 1998 } 1999 break; 2000 } 2001 2002 // PSHUFB 2003 case Intrinsic::x86_ssse3_pshuf_b_128: 2004 case Intrinsic::x86_avx2_pshuf_b: 2005 case Intrinsic::x86_avx512_pshuf_b_512: 2006 // PERMILVAR 2007 case Intrinsic::x86_avx_vpermilvar_ps: 2008 case Intrinsic::x86_avx_vpermilvar_ps_256: 2009 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2010 case Intrinsic::x86_avx_vpermilvar_pd: 2011 case Intrinsic::x86_avx_vpermilvar_pd_256: 2012 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2013 // PERMV 2014 case Intrinsic::x86_avx2_permd: 2015 case Intrinsic::x86_avx2_permps: { 2016 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 2017 break; 2018 } 2019 2020 // SSE4A instructions leave the upper 64-bits of the 128-bit result 2021 // in an undefined state. 2022 case Intrinsic::x86_sse4a_extrq: 2023 case Intrinsic::x86_sse4a_extrqi: 2024 case Intrinsic::x86_sse4a_insertq: 2025 case Intrinsic::x86_sse4a_insertqi: 2026 UndefElts.setHighBits(VWidth / 2); 2027 break; 2028 } 2029 return std::nullopt; 2030 } 2031