1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements a TargetTransformInfo analysis pass specific to the 10 /// X86 target machine. It uses the target's detailed information to provide 11 /// more precise answers to certain TTI queries, while letting the target 12 /// independent and default TTI implementations handle the rest. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "X86TargetTransformInfo.h" 17 #include "llvm/IR/IntrinsicInst.h" 18 #include "llvm/IR/IntrinsicsX86.h" 19 #include "llvm/Support/KnownBits.h" 20 #include "llvm/Transforms/InstCombine/InstCombiner.h" 21 22 using namespace llvm; 23 24 #define DEBUG_TYPE "x86tti" 25 26 /// Return a constant boolean vector that has true elements in all positions 27 /// where the input constant data vector has an element with the sign bit set. 28 static Constant *getNegativeIsTrueBoolVec(Constant *V) { 29 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); 30 V = ConstantExpr::getBitCast(V, IntTy); 31 V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy), 32 V); 33 return V; 34 } 35 36 /// Convert the x86 XMM integer vector mask to a vector of bools based on 37 /// each element's most significant bit (the sign bit). 38 static Value *getBoolVecFromMask(Value *Mask) { 39 // Fold Constant Mask. 40 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) 41 return getNegativeIsTrueBoolVec(ConstantMask); 42 43 // Mask was extended from a boolean vector. 
44 Value *ExtMask; 45 if (PatternMatch::match( 46 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && 47 ExtMask->getType()->isIntOrIntVectorTy(1)) 48 return ExtMask; 49 50 return nullptr; 51 } 52 53 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 54 // XMM register mask efficiently, we could transform all x86 masked intrinsics 55 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 56 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 57 Value *Ptr = II.getOperand(0); 58 Value *Mask = II.getOperand(1); 59 Constant *ZeroVec = Constant::getNullValue(II.getType()); 60 61 // Zero Mask - masked load instruction creates a zero vector. 62 if (isa<ConstantAggregateZero>(Mask)) 63 return IC.replaceInstUsesWith(II, ZeroVec); 64 65 // The mask is constant or extended from a bool vector. Convert this x86 66 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 67 if (Value *BoolMask = getBoolVecFromMask(Mask)) { 68 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 69 // the LLVM intrinsic definition for the pointer argument. 70 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 71 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 72 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 73 74 // The pass-through vector for an x86 masked load is a zero vector. 75 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( 76 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec); 77 return IC.replaceInstUsesWith(II, NewMaskedLoad); 78 } 79 80 return nullptr; 81 } 82 83 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 84 // XMM register mask efficiently, we could transform all x86 masked intrinsics 85 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 
86 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 87 Value *Ptr = II.getOperand(0); 88 Value *Mask = II.getOperand(1); 89 Value *Vec = II.getOperand(2); 90 91 // Zero Mask - this masked store instruction does nothing. 92 if (isa<ConstantAggregateZero>(Mask)) { 93 IC.eraseInstFromFunction(II); 94 return true; 95 } 96 97 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 98 // anything else at this level. 99 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 100 return false; 101 102 // The mask is constant or extended from a bool vector. Convert this x86 103 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 104 if (Value *BoolMask = getBoolVecFromMask(Mask)) { 105 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 106 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 107 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 108 109 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 110 111 // 'Replace uses' doesn't work for stores. Erase the original masked store. 
112 IC.eraseInstFromFunction(II); 113 return true; 114 } 115 116 return false; 117 } 118 119 static Value *simplifyX86immShift(const IntrinsicInst &II, 120 InstCombiner::BuilderTy &Builder) { 121 bool LogicalShift = false; 122 bool ShiftLeft = false; 123 bool IsImm = false; 124 125 switch (II.getIntrinsicID()) { 126 default: 127 llvm_unreachable("Unexpected intrinsic!"); 128 case Intrinsic::x86_sse2_psrai_d: 129 case Intrinsic::x86_sse2_psrai_w: 130 case Intrinsic::x86_avx2_psrai_d: 131 case Intrinsic::x86_avx2_psrai_w: 132 case Intrinsic::x86_avx512_psrai_q_128: 133 case Intrinsic::x86_avx512_psrai_q_256: 134 case Intrinsic::x86_avx512_psrai_d_512: 135 case Intrinsic::x86_avx512_psrai_q_512: 136 case Intrinsic::x86_avx512_psrai_w_512: 137 IsImm = true; 138 LLVM_FALLTHROUGH; 139 case Intrinsic::x86_sse2_psra_d: 140 case Intrinsic::x86_sse2_psra_w: 141 case Intrinsic::x86_avx2_psra_d: 142 case Intrinsic::x86_avx2_psra_w: 143 case Intrinsic::x86_avx512_psra_q_128: 144 case Intrinsic::x86_avx512_psra_q_256: 145 case Intrinsic::x86_avx512_psra_d_512: 146 case Intrinsic::x86_avx512_psra_q_512: 147 case Intrinsic::x86_avx512_psra_w_512: 148 LogicalShift = false; 149 ShiftLeft = false; 150 break; 151 case Intrinsic::x86_sse2_psrli_d: 152 case Intrinsic::x86_sse2_psrli_q: 153 case Intrinsic::x86_sse2_psrli_w: 154 case Intrinsic::x86_avx2_psrli_d: 155 case Intrinsic::x86_avx2_psrli_q: 156 case Intrinsic::x86_avx2_psrli_w: 157 case Intrinsic::x86_avx512_psrli_d_512: 158 case Intrinsic::x86_avx512_psrli_q_512: 159 case Intrinsic::x86_avx512_psrli_w_512: 160 IsImm = true; 161 LLVM_FALLTHROUGH; 162 case Intrinsic::x86_sse2_psrl_d: 163 case Intrinsic::x86_sse2_psrl_q: 164 case Intrinsic::x86_sse2_psrl_w: 165 case Intrinsic::x86_avx2_psrl_d: 166 case Intrinsic::x86_avx2_psrl_q: 167 case Intrinsic::x86_avx2_psrl_w: 168 case Intrinsic::x86_avx512_psrl_d_512: 169 case Intrinsic::x86_avx512_psrl_q_512: 170 case Intrinsic::x86_avx512_psrl_w_512: 171 LogicalShift = true; 172 ShiftLeft 
= false; 173 break; 174 case Intrinsic::x86_sse2_pslli_d: 175 case Intrinsic::x86_sse2_pslli_q: 176 case Intrinsic::x86_sse2_pslli_w: 177 case Intrinsic::x86_avx2_pslli_d: 178 case Intrinsic::x86_avx2_pslli_q: 179 case Intrinsic::x86_avx2_pslli_w: 180 case Intrinsic::x86_avx512_pslli_d_512: 181 case Intrinsic::x86_avx512_pslli_q_512: 182 case Intrinsic::x86_avx512_pslli_w_512: 183 IsImm = true; 184 LLVM_FALLTHROUGH; 185 case Intrinsic::x86_sse2_psll_d: 186 case Intrinsic::x86_sse2_psll_q: 187 case Intrinsic::x86_sse2_psll_w: 188 case Intrinsic::x86_avx2_psll_d: 189 case Intrinsic::x86_avx2_psll_q: 190 case Intrinsic::x86_avx2_psll_w: 191 case Intrinsic::x86_avx512_psll_d_512: 192 case Intrinsic::x86_avx512_psll_q_512: 193 case Intrinsic::x86_avx512_psll_w_512: 194 LogicalShift = true; 195 ShiftLeft = true; 196 break; 197 } 198 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 199 200 Value *Vec = II.getArgOperand(0); 201 Value *Amt = II.getArgOperand(1); 202 auto *VT = cast<FixedVectorType>(Vec->getType()); 203 Type *SVT = VT->getElementType(); 204 Type *AmtVT = Amt->getType(); 205 unsigned VWidth = VT->getNumElements(); 206 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 207 208 // If the shift amount is guaranteed to be in-range we can replace it with a 209 // generic shift. If its guaranteed to be out of range, logical shifts combine 210 // to zero and arithmetic shifts are clamped to (BitWidth - 1). 211 if (IsImm) { 212 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); 213 KnownBits KnownAmtBits = 214 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); 215 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 216 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 217 Amt = Builder.CreateVectorSplat(VWidth, Amt); 218 return (LogicalShift ? (ShiftLeft ? 
Builder.CreateShl(Vec, Amt) 219 : Builder.CreateLShr(Vec, Amt)) 220 : Builder.CreateAShr(Vec, Amt)); 221 } 222 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 223 if (LogicalShift) 224 return ConstantAggregateZero::get(VT); 225 Amt = ConstantInt::get(SVT, BitWidth - 1); 226 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 227 } 228 } else { 229 // Ensure the first element has an in-range value and the rest of the 230 // elements in the bottom 64 bits are zero. 231 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 232 cast<VectorType>(AmtVT)->getElementType() == SVT && 233 "Unexpected shift-by-scalar type"); 234 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); 235 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 236 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 237 KnownBits KnownLowerBits = llvm::computeKnownBits( 238 Amt, DemandedLower, II.getModule()->getDataLayout()); 239 KnownBits KnownUpperBits = llvm::computeKnownBits( 240 Amt, DemandedUpper, II.getModule()->getDataLayout()); 241 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 242 (DemandedUpper.isZero() || KnownUpperBits.isZero())) { 243 SmallVector<int, 16> ZeroSplat(VWidth, 0); 244 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); 245 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 246 : Builder.CreateLShr(Vec, Amt)) 247 : Builder.CreateAShr(Vec, Amt)); 248 } 249 } 250 251 // Simplify if count is constant vector. 252 auto *CDV = dyn_cast<ConstantDataVector>(Amt); 253 if (!CDV) 254 return nullptr; 255 256 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 257 // operand to compute the shift amount. 258 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 259 cast<VectorType>(AmtVT)->getElementType() == SVT && 260 "Unexpected shift-by-scalar type"); 261 262 // Concatenate the sub-elements to create the 64-bit value. 
263 APInt Count(64, 0); 264 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 265 unsigned SubEltIdx = (NumSubElts - 1) - i; 266 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 267 Count <<= BitWidth; 268 Count |= SubElt->getValue().zextOrTrunc(64); 269 } 270 271 // If shift-by-zero then just return the original value. 272 if (Count.isZero()) 273 return Vec; 274 275 // Handle cases when Shift >= BitWidth. 276 if (Count.uge(BitWidth)) { 277 // If LogicalShift - just return zero. 278 if (LogicalShift) 279 return ConstantAggregateZero::get(VT); 280 281 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 282 Count = APInt(64, BitWidth - 1); 283 } 284 285 // Get a constant vector of the same type as the first operand. 286 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 287 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 288 289 if (ShiftLeft) 290 return Builder.CreateShl(Vec, ShiftVec); 291 292 if (LogicalShift) 293 return Builder.CreateLShr(Vec, ShiftVec); 294 295 return Builder.CreateAShr(Vec, ShiftVec); 296 } 297 298 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 299 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 300 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
301 static Value *simplifyX86varShift(const IntrinsicInst &II, 302 InstCombiner::BuilderTy &Builder) { 303 bool LogicalShift = false; 304 bool ShiftLeft = false; 305 306 switch (II.getIntrinsicID()) { 307 default: 308 llvm_unreachable("Unexpected intrinsic!"); 309 case Intrinsic::x86_avx2_psrav_d: 310 case Intrinsic::x86_avx2_psrav_d_256: 311 case Intrinsic::x86_avx512_psrav_q_128: 312 case Intrinsic::x86_avx512_psrav_q_256: 313 case Intrinsic::x86_avx512_psrav_d_512: 314 case Intrinsic::x86_avx512_psrav_q_512: 315 case Intrinsic::x86_avx512_psrav_w_128: 316 case Intrinsic::x86_avx512_psrav_w_256: 317 case Intrinsic::x86_avx512_psrav_w_512: 318 LogicalShift = false; 319 ShiftLeft = false; 320 break; 321 case Intrinsic::x86_avx2_psrlv_d: 322 case Intrinsic::x86_avx2_psrlv_d_256: 323 case Intrinsic::x86_avx2_psrlv_q: 324 case Intrinsic::x86_avx2_psrlv_q_256: 325 case Intrinsic::x86_avx512_psrlv_d_512: 326 case Intrinsic::x86_avx512_psrlv_q_512: 327 case Intrinsic::x86_avx512_psrlv_w_128: 328 case Intrinsic::x86_avx512_psrlv_w_256: 329 case Intrinsic::x86_avx512_psrlv_w_512: 330 LogicalShift = true; 331 ShiftLeft = false; 332 break; 333 case Intrinsic::x86_avx2_psllv_d: 334 case Intrinsic::x86_avx2_psllv_d_256: 335 case Intrinsic::x86_avx2_psllv_q: 336 case Intrinsic::x86_avx2_psllv_q_256: 337 case Intrinsic::x86_avx512_psllv_d_512: 338 case Intrinsic::x86_avx512_psllv_q_512: 339 case Intrinsic::x86_avx512_psllv_w_128: 340 case Intrinsic::x86_avx512_psllv_w_256: 341 case Intrinsic::x86_avx512_psllv_w_512: 342 LogicalShift = true; 343 ShiftLeft = true; 344 break; 345 } 346 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 347 348 Value *Vec = II.getArgOperand(0); 349 Value *Amt = II.getArgOperand(1); 350 auto *VT = cast<FixedVectorType>(II.getType()); 351 Type *SVT = VT->getElementType(); 352 int NumElts = VT->getNumElements(); 353 int BitWidth = SVT->getIntegerBitWidth(); 354 355 // If the shift amount is guaranteed to be in-range we can 
replace it with a 356 // generic shift. 357 APInt UpperBits = 358 APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); 359 if (llvm::MaskedValueIsZero(Amt, UpperBits, 360 II.getModule()->getDataLayout())) { 361 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 362 : Builder.CreateLShr(Vec, Amt)) 363 : Builder.CreateAShr(Vec, Amt)); 364 } 365 366 // Simplify if all shift amounts are constant/undef. 367 auto *CShift = dyn_cast<Constant>(Amt); 368 if (!CShift) 369 return nullptr; 370 371 // Collect each element's shift amount. 372 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 373 bool AnyOutOfRange = false; 374 SmallVector<int, 8> ShiftAmts; 375 for (int I = 0; I < NumElts; ++I) { 376 auto *CElt = CShift->getAggregateElement(I); 377 if (isa_and_nonnull<UndefValue>(CElt)) { 378 ShiftAmts.push_back(-1); 379 continue; 380 } 381 382 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 383 if (!COp) 384 return nullptr; 385 386 // Handle out of range shifts. 387 // If LogicalShift - set to BitWidth (special case). 388 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 389 APInt ShiftVal = COp->getValue(); 390 if (ShiftVal.uge(BitWidth)) { 391 AnyOutOfRange = LogicalShift; 392 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 393 continue; 394 } 395 396 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 397 } 398 399 // If all elements out of range or UNDEF, return vector of zeros/undefs. 400 // ArithmeticShift should only hit this if they are all UNDEF. 
401 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 402 if (llvm::all_of(ShiftAmts, OutOfRange)) { 403 SmallVector<Constant *, 8> ConstantVec; 404 for (int Idx : ShiftAmts) { 405 if (Idx < 0) { 406 ConstantVec.push_back(UndefValue::get(SVT)); 407 } else { 408 assert(LogicalShift && "Logical shift expected"); 409 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 410 } 411 } 412 return ConstantVector::get(ConstantVec); 413 } 414 415 // We can't handle only some out of range values with generic logical shifts. 416 if (AnyOutOfRange) 417 return nullptr; 418 419 // Build the shift amount constant vector. 420 SmallVector<Constant *, 8> ShiftVecAmts; 421 for (int Idx : ShiftAmts) { 422 if (Idx < 0) 423 ShiftVecAmts.push_back(UndefValue::get(SVT)); 424 else 425 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 426 } 427 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 428 429 if (ShiftLeft) 430 return Builder.CreateShl(Vec, ShiftVec); 431 432 if (LogicalShift) 433 return Builder.CreateLShr(Vec, ShiftVec); 434 435 return Builder.CreateAShr(Vec, ShiftVec); 436 } 437 438 static Value *simplifyX86pack(IntrinsicInst &II, 439 InstCombiner::BuilderTy &Builder, bool IsSigned) { 440 Value *Arg0 = II.getArgOperand(0); 441 Value *Arg1 = II.getArgOperand(1); 442 Type *ResTy = II.getType(); 443 444 // Fast all undef handling. 
445 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 446 return UndefValue::get(ResTy); 447 448 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 449 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 450 unsigned NumSrcElts = ArgTy->getNumElements(); 451 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && 452 "Unexpected packing types"); 453 454 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 455 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 456 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 457 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 458 "Unexpected packing types"); 459 460 // Constant folding. 461 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 462 return nullptr; 463 464 // Clamp Values - signed/unsigned both use signed clamp values, but they 465 // differ on the min/max values. 466 APInt MinValue, MaxValue; 467 if (IsSigned) { 468 // PACKSS: Truncate signed value with signed saturation. 469 // Source values less than dst minint are saturated to minint. 470 // Source values greater than dst maxint are saturated to maxint. 471 MinValue = 472 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 473 MaxValue = 474 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 475 } else { 476 // PACKUS: Truncate signed value with unsigned saturation. 477 // Source values less than zero are saturated to zero. 478 // Source values greater than dst maxuint are saturated to maxuint. 
479 MinValue = APInt::getZero(SrcScalarSizeInBits); 480 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 481 } 482 483 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 484 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 485 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 486 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 487 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 488 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 489 490 // Shuffle clamped args together at the lane level. 491 SmallVector<int, 32> PackMask; 492 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 493 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 494 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 495 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 496 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 497 } 498 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 499 500 // Truncate to dst size. 501 return Builder.CreateTrunc(Shuffle, ResTy); 502 } 503 504 static Value *simplifyX86movmsk(const IntrinsicInst &II, 505 InstCombiner::BuilderTy &Builder) { 506 Value *Arg = II.getArgOperand(0); 507 Type *ResTy = II.getType(); 508 509 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 510 if (isa<UndefValue>(Arg)) 511 return Constant::getNullValue(ResTy); 512 513 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); 514 // We can't easily peek through x86_mmx types. 515 if (!ArgTy) 516 return nullptr; 517 518 // Expand MOVMSK to compare/bitcast/zext: 519 // e.g. 
PMOVMSKB(v16i8 x): 520 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 521 // %int = bitcast <16 x i1> %cmp to i16 522 // %res = zext i16 %int to i32 523 unsigned NumElts = ArgTy->getNumElements(); 524 Type *IntegerVecTy = VectorType::getInteger(ArgTy); 525 Type *IntegerTy = Builder.getIntNTy(NumElts); 526 527 Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); 528 Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); 529 Res = Builder.CreateBitCast(Res, IntegerTy); 530 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 531 return Res; 532 } 533 534 static Value *simplifyX86addcarry(const IntrinsicInst &II, 535 InstCombiner::BuilderTy &Builder) { 536 Value *CarryIn = II.getArgOperand(0); 537 Value *Op1 = II.getArgOperand(1); 538 Value *Op2 = II.getArgOperand(2); 539 Type *RetTy = II.getType(); 540 Type *OpTy = Op1->getType(); 541 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 542 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 543 "Unexpected types for x86 addcarry"); 544 545 // If carry-in is zero, this is just an unsigned add with overflow. 546 if (match(CarryIn, PatternMatch::m_ZeroInt())) { 547 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 548 {Op1, Op2}); 549 // The types have to be adjusted to match the x86 call types. 
550 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); 551 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), 552 Builder.getInt8Ty()); 553 Value *Res = UndefValue::get(RetTy); 554 Res = Builder.CreateInsertValue(Res, UAddOV, 0); 555 return Builder.CreateInsertValue(Res, UAddResult, 1); 556 } 557 558 return nullptr; 559 } 560 561 static Value *simplifyX86insertps(const IntrinsicInst &II, 562 InstCombiner::BuilderTy &Builder) { 563 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 564 if (!CInt) 565 return nullptr; 566 567 auto *VecTy = cast<FixedVectorType>(II.getType()); 568 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 569 570 // The immediate permute control byte looks like this: 571 // [3:0] - zero mask for each 32-bit lane 572 // [5:4] - select one 32-bit destination lane 573 // [7:6] - select one 32-bit source lane 574 575 uint8_t Imm = CInt->getZExtValue(); 576 uint8_t ZMask = Imm & 0xf; 577 uint8_t DestLane = (Imm >> 4) & 0x3; 578 uint8_t SourceLane = (Imm >> 6) & 0x3; 579 580 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 581 582 // If all zero mask bits are set, this was just a weird way to 583 // generate a zero vector. 584 if (ZMask == 0xf) 585 return ZeroVector; 586 587 // Initialize by passing all of the first source bits through. 588 int ShuffleMask[4] = {0, 1, 2, 3}; 589 590 // We may replace the second operand with the zero vector. 591 Value *V1 = II.getArgOperand(1); 592 593 if (ZMask) { 594 // If the zero mask is being used with a single input or the zero mask 595 // overrides the destination lane, this is a shuffle with the zero vector. 596 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 597 (ZMask & (1 << DestLane))) { 598 V1 = ZeroVector; 599 // We may still move 32-bits of the first source vector from one lane 600 // to another. 601 ShuffleMask[DestLane] = SourceLane; 602 // The zero mask may override the previous insert operation. 
603 for (unsigned i = 0; i < 4; ++i) 604 if ((ZMask >> i) & 0x1) 605 ShuffleMask[i] = i + 4; 606 } else { 607 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 608 return nullptr; 609 } 610 } else { 611 // Replace the selected destination lane with the selected source lane. 612 ShuffleMask[DestLane] = SourceLane + 4; 613 } 614 615 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 616 } 617 618 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 619 /// or conversion to a shuffle vector. 620 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 621 ConstantInt *CILength, ConstantInt *CIIndex, 622 InstCombiner::BuilderTy &Builder) { 623 auto LowConstantHighUndef = [&](uint64_t Val) { 624 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 625 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 626 UndefValue::get(IntTy64)}; 627 return ConstantVector::get(Args); 628 }; 629 630 // See if we're dealing with constant values. 631 auto *C0 = dyn_cast<Constant>(Op0); 632 auto *CI0 = 633 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 634 : nullptr; 635 636 // Attempt to constant fold. 637 if (CILength && CIIndex) { 638 // From AMD documentation: "The bit index and field length are each six 639 // bits in length other bits of the field are ignored." 640 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 641 APInt APLength = CILength->getValue().zextOrTrunc(6); 642 643 unsigned Index = APIndex.getZExtValue(); 644 645 // From AMD documentation: "a value of zero in the field length is 646 // defined as length of 64". 647 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 648 649 // From AMD documentation: "If the sum of the bit index + length field 650 // is greater than 64, the results are undefined". 651 unsigned End = Index + Length; 652 653 // Note that both field index and field length are 8-bit quantities. 
654 // Since variables 'Index' and 'Length' are unsigned values 655 // obtained from zero-extending field index and field length 656 // respectively, their sum should never wrap around. 657 if (End > 64) 658 return UndefValue::get(II.getType()); 659 660 // If we are inserting whole bytes, we can convert this to a shuffle. 661 // Lowering can recognize EXTRQI shuffle masks. 662 if ((Length % 8) == 0 && (Index % 8) == 0) { 663 // Convert bit indices to byte indices. 664 Length /= 8; 665 Index /= 8; 666 667 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 668 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 669 670 SmallVector<int, 16> ShuffleMask; 671 for (int i = 0; i != (int)Length; ++i) 672 ShuffleMask.push_back(i + Index); 673 for (int i = Length; i != 8; ++i) 674 ShuffleMask.push_back(i + 16); 675 for (int i = 8; i != 16; ++i) 676 ShuffleMask.push_back(-1); 677 678 Value *SV = Builder.CreateShuffleVector( 679 Builder.CreateBitCast(Op0, ShufTy), 680 ConstantAggregateZero::get(ShufTy), ShuffleMask); 681 return Builder.CreateBitCast(SV, II.getType()); 682 } 683 684 // Constant Fold - shift Index'th bit to lowest position and mask off 685 // Length bits. 686 if (CI0) { 687 APInt Elt = CI0->getValue(); 688 Elt.lshrInPlace(Index); 689 Elt = Elt.zextOrTrunc(Length); 690 return LowConstantHighUndef(Elt.getZExtValue()); 691 } 692 693 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 694 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 695 Value *Args[] = {Op0, CILength, CIIndex}; 696 Module *M = II.getModule(); 697 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 698 return Builder.CreateCall(F, Args); 699 } 700 } 701 702 // Constant Fold - extraction from zero is always {zero, undef}. 703 if (CI0 && CI0->isZero()) 704 return LowConstantHighUndef(0); 705 706 return nullptr; 707 } 708 709 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 710 /// folding or conversion to a shuffle vector. 
711 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 712 APInt APLength, APInt APIndex, 713 InstCombiner::BuilderTy &Builder) { 714 // From AMD documentation: "The bit index and field length are each six bits 715 // in length other bits of the field are ignored." 716 APIndex = APIndex.zextOrTrunc(6); 717 APLength = APLength.zextOrTrunc(6); 718 719 // Attempt to constant fold. 720 unsigned Index = APIndex.getZExtValue(); 721 722 // From AMD documentation: "a value of zero in the field length is 723 // defined as length of 64". 724 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 725 726 // From AMD documentation: "If the sum of the bit index + length field 727 // is greater than 64, the results are undefined". 728 unsigned End = Index + Length; 729 730 // Note that both field index and field length are 8-bit quantities. 731 // Since variables 'Index' and 'Length' are unsigned values 732 // obtained from zero-extending field index and field length 733 // respectively, their sum should never wrap around. 734 if (End > 64) 735 return UndefValue::get(II.getType()); 736 737 // If we are inserting whole bytes, we can convert this to a shuffle. 738 // Lowering can recognize INSERTQI shuffle masks. 739 if ((Length % 8) == 0 && (Index % 8) == 0) { 740 // Convert bit indices to byte indices. 
741 Length /= 8; 742 Index /= 8; 743 744 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 745 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 746 747 SmallVector<int, 16> ShuffleMask; 748 for (int i = 0; i != (int)Index; ++i) 749 ShuffleMask.push_back(i); 750 for (int i = 0; i != (int)Length; ++i) 751 ShuffleMask.push_back(i + 16); 752 for (int i = Index + Length; i != 8; ++i) 753 ShuffleMask.push_back(i); 754 for (int i = 8; i != 16; ++i) 755 ShuffleMask.push_back(-1); 756 757 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 758 Builder.CreateBitCast(Op1, ShufTy), 759 ShuffleMask); 760 return Builder.CreateBitCast(SV, II.getType()); 761 } 762 763 // See if we're dealing with constant values. 764 auto *C0 = dyn_cast<Constant>(Op0); 765 auto *C1 = dyn_cast<Constant>(Op1); 766 auto *CI00 = 767 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 768 : nullptr; 769 auto *CI10 = 770 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 771 : nullptr; 772 773 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 774 if (CI00 && CI10) { 775 APInt V00 = CI00->getValue(); 776 APInt V10 = CI10->getValue(); 777 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 778 V00 = V00 & ~Mask; 779 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 780 APInt Val = V00 | V10; 781 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 782 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 783 UndefValue::get(IntTy64)}; 784 return ConstantVector::get(Args); 785 } 786 787 // If we were an INSERTQ call, we'll save demanded elements if we convert to 788 // INSERTQI. 
789 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 790 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 791 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 792 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 793 794 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 795 Module *M = II.getModule(); 796 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 797 return Builder.CreateCall(F, Args); 798 } 799 800 return nullptr; 801 } 802 803 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 804 static Value *simplifyX86pshufb(const IntrinsicInst &II, 805 InstCombiner::BuilderTy &Builder) { 806 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 807 if (!V) 808 return nullptr; 809 810 auto *VecTy = cast<FixedVectorType>(II.getType()); 811 unsigned NumElts = VecTy->getNumElements(); 812 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 813 "Unexpected number of elements in shuffle mask!"); 814 815 // Construct a shuffle mask from constant integers or UNDEFs. 816 int Indexes[64]; 817 818 // Each byte in the shuffle control mask forms an index to permute the 819 // corresponding byte in the destination operand. 820 for (unsigned I = 0; I < NumElts; ++I) { 821 Constant *COp = V->getAggregateElement(I); 822 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 823 return nullptr; 824 825 if (isa<UndefValue>(COp)) { 826 Indexes[I] = -1; 827 continue; 828 } 829 830 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 831 832 // If the most significant bit (bit[7]) of each byte of the shuffle 833 // control mask is set, then zero is written in the result byte. 834 // The zero vector is in the right-hand side of the resulting 835 // shufflevector. 836 837 // The value of each index for the high 128-bit lane is the least 838 // significant 4 bits of the respective shuffle control byte. 839 Index = ((Index < 0) ? 
NumElts : Index & 0x0F) + (I & 0xF0); 840 Indexes[I] = Index; 841 } 842 843 auto V1 = II.getArgOperand(0); 844 auto V2 = Constant::getNullValue(VecTy); 845 return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); 846 } 847 848 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 849 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 850 InstCombiner::BuilderTy &Builder) { 851 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 852 if (!V) 853 return nullptr; 854 855 auto *VecTy = cast<FixedVectorType>(II.getType()); 856 unsigned NumElts = VecTy->getNumElements(); 857 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 858 unsigned NumLaneElts = IsPD ? 2 : 4; 859 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 860 861 // Construct a shuffle mask from constant integers or UNDEFs. 862 int Indexes[16]; 863 864 // The intrinsics only read one or two bits, clear the rest. 865 for (unsigned I = 0; I < NumElts; ++I) { 866 Constant *COp = V->getAggregateElement(I); 867 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 868 return nullptr; 869 870 if (isa<UndefValue>(COp)) { 871 Indexes[I] = -1; 872 continue; 873 } 874 875 APInt Index = cast<ConstantInt>(COp)->getValue(); 876 Index = Index.zextOrTrunc(32).getLoBits(2); 877 878 // The PD variants uses bit 1 to select per-lane element index, so 879 // shift down to convert to generic shuffle mask index. 880 if (IsPD) 881 Index.lshrInPlace(1); 882 883 // The _256 variants are a bit trickier since the mask bits always index 884 // into the corresponding 128 half. In order to convert to a generic 885 // shuffle, we have to make that explicit. 
886 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 887 888 Indexes[I] = Index.getZExtValue(); 889 } 890 891 auto V1 = II.getArgOperand(0); 892 return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts)); 893 } 894 895 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 896 static Value *simplifyX86vpermv(const IntrinsicInst &II, 897 InstCombiner::BuilderTy &Builder) { 898 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 899 if (!V) 900 return nullptr; 901 902 auto *VecTy = cast<FixedVectorType>(II.getType()); 903 unsigned Size = VecTy->getNumElements(); 904 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 905 "Unexpected shuffle mask size"); 906 907 // Construct a shuffle mask from constant integers or UNDEFs. 908 int Indexes[64]; 909 910 for (unsigned I = 0; I < Size; ++I) { 911 Constant *COp = V->getAggregateElement(I); 912 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 913 return nullptr; 914 915 if (isa<UndefValue>(COp)) { 916 Indexes[I] = -1; 917 continue; 918 } 919 920 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 921 Index &= Size - 1; 922 Indexes[I] = Index; 923 } 924 925 auto V1 = II.getArgOperand(0); 926 return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size)); 927 } 928 929 Optional<Instruction *> 930 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 931 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 932 unsigned DemandedWidth) { 933 APInt UndefElts(Width, 0); 934 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 935 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 936 }; 937 938 Intrinsic::ID IID = II.getIntrinsicID(); 939 switch (IID) { 940 case Intrinsic::x86_bmi_bextr_32: 941 case Intrinsic::x86_bmi_bextr_64: 942 case Intrinsic::x86_tbm_bextri_u32: 943 case Intrinsic::x86_tbm_bextri_u64: 944 // If the RHS is a constant we can try some simplifications. 
// This span resumes inside the switch of X86TTIImpl::instCombineIntrinsic, in
// the BEXTR/BEXTRI case whose labels immediately precede it.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      // The control operand packs two fields: bits [7:0] are the start bit
      // and bits [15:8] are the number of bits to extract.
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        // Clamp the length so the mask below never exceeds the value width.
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      // Only the low 8 bits of the index operand are considered.
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // An index at or beyond the value width keeps every bit of the source.
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      // An index of zero keeps no bits at all.
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    // All folds below require a constant mask (operand 1).
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      // A zero mask selects nothing: the result is zero.
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // An all-ones mask selects every bit: the result is the source.
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR: mask the input,
        // then shift it right by the number of trailing zeros in the mask.
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *Shifted = IC.Builder.CreateLShr(Masked,
                                               ConstantInt::get(II.getType(),
                                                                ShiftAmount));
        return IC.replaceInstUsesWith(II, Shifted);
      }


      // Both operands constant: fold completely. Walk the set bits of the
      // mask from lowest to highest; each one that is also set in the source
      // sets the next consecutive low bit of the result.
      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    // All folds below require a constant mask (operand 1).
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      // A zero mask deposits nothing: the result is zero.
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // An all-ones mask deposits every bit in place: the result is the
      // source.
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR: shift the input
        // left by the number of trailing zeros in the mask, then mask it.
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Shifted = IC.Builder.CreateShl(Input,
                                              ConstantInt::get(II.getType(),
                                                               ShiftAmount));
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      // Both operands constant: fold completely. Walk the set bits of the
      // mask from lowest to highest; consecutive low bits of the source
      // decide whether each of those mask positions is set in the result.
      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  // MOVMSK family: handled by simplifyX86movmsk (defined outside this span).
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    // The element count of operand 0 is used for both operands.
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    // Return the call itself when either operand was updated in place.
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations. The rounding mode is operand 2.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations. Operands as used below: 0 and 1 are the sources, 2 is
    // the passthru vector, 3 is the mask and 4 is the rounding mode.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  // Shifts whose amount is passed in a vector operand. The same helper as
  // for the immediate forms is tried first.
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    // Demand only the low half of the shift-amount vector's elements.
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  // Per-element variable shifts: handled by simplifyX86varShift (defined
  // outside this span).
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  // PACKSS family: simplifyX86pack is called with its third argument true.
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  // PACKUS family: simplifyX86pack is called with its third argument false.
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    // Only handled when the immediate (operand 2) is a constant.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      // Within every pair of elements of operand 0, immediate bit 0 decides
      // which one is demanded: the upper one if set, the lower one if clear.
      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      // Immediate bit 4 makes the same choice for operand 1.
      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If all demanded elements of either input are undef, the result is
      // zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values. Element 0 of the second
    // operand is the length and element 1 is the index.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operands and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      // Element 1 of the second operand carries both fields: the length is
      // its low 6 bits and the index is the 6 bits starting at bit 8.
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source and
    // insert over first source starting at Index bit. The upper 64-bits are
    // undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector. Both immediates
    // are reduced to 6 bits before being handed to the helper.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    // A lane whose mask element has its sign bit set takes Op1, else Op0.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      // Same element count: the boolean vector can drive the select directly.
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    // NOTE(review): on success this returns an engaged Optional holding
    // nullptr, unlike the None returned when nothing applies. Presumably that
    // tells the caller the helper already dealt with the call - confirm
    // against simplifyX86MaskedStore and the caller's contract.
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  default:
    break;
  }
  // No fold applied.
  return None;
}

/// Demanded-bits simplification for X86 intrinsics; only the MOVMSK family is
/// handled.
///
/// \p DemandedMask says which result bits are demanded. If none of the low
/// ArgWidth result bits are demanded, a zero constant of the result type is
/// returned. Otherwise the result bits from ArgWidth upward are recorded as
/// known zero in \p Known, \p KnownBitsComputed is set to true, and None is
/// returned. Any other intrinsic returns None without touching the outputs.
Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
    // ArgWidth is the number of source elements, i.e. the number of result
    // bits that can be non-zero.
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of low bits then return zero,
    // we know that DemandedMask is non-zero already.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are speced to zero upper bits not
    // pass them through like other scalar intrinsics.
So we shouldn't just 1793 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. 1794 // Instead we should return a zero vector. 1795 if (!DemandedElts[0]) { 1796 IC.addToWorklist(&II); 1797 return ConstantAggregateZero::get(II.getType()); 1798 } 1799 1800 // Only the lower element is used. 1801 DemandedElts = 1; 1802 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1803 1804 // Only the lower element is undefined. The high elements are zero. 1805 UndefElts = UndefElts[0]; 1806 break; 1807 1808 // Unary scalar-as-vector operations that work column-wise. 1809 case Intrinsic::x86_sse_rcp_ss: 1810 case Intrinsic::x86_sse_rsqrt_ss: 1811 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1812 1813 // If lowest element of a scalar op isn't used then use Arg0. 1814 if (!DemandedElts[0]) { 1815 IC.addToWorklist(&II); 1816 return II.getArgOperand(0); 1817 } 1818 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 1819 // checks). 1820 break; 1821 1822 // Binary scalar-as-vector operations that work column-wise. The high 1823 // elements come from operand 0. The low element is a function of both 1824 // operands. 1825 case Intrinsic::x86_sse_min_ss: 1826 case Intrinsic::x86_sse_max_ss: 1827 case Intrinsic::x86_sse_cmp_ss: 1828 case Intrinsic::x86_sse2_min_sd: 1829 case Intrinsic::x86_sse2_max_sd: 1830 case Intrinsic::x86_sse2_cmp_sd: { 1831 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1832 1833 // If lowest element of a scalar op isn't used then use Arg0. 1834 if (!DemandedElts[0]) { 1835 IC.addToWorklist(&II); 1836 return II.getArgOperand(0); 1837 } 1838 1839 // Only lower element is used for operand 1. 1840 DemandedElts = 1; 1841 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1842 1843 // Lower element is undefined if both lower elements are undefined. 1844 // Consider things like undef&0. The result is known zero, not undef. 
1845 if (!UndefElts2[0]) 1846 UndefElts.clearBit(0); 1847 1848 break; 1849 } 1850 1851 // Binary scalar-as-vector operations that work column-wise. The high 1852 // elements come from operand 0 and the low element comes from operand 1. 1853 case Intrinsic::x86_sse41_round_ss: 1854 case Intrinsic::x86_sse41_round_sd: { 1855 // Don't use the low element of operand 0. 1856 APInt DemandedElts2 = DemandedElts; 1857 DemandedElts2.clearBit(0); 1858 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 1859 1860 // If lowest element of a scalar op isn't used then use Arg0. 1861 if (!DemandedElts[0]) { 1862 IC.addToWorklist(&II); 1863 return II.getArgOperand(0); 1864 } 1865 1866 // Only lower element is used for operand 1. 1867 DemandedElts = 1; 1868 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1869 1870 // Take the high undef elements from operand 0 and take the lower element 1871 // from operand 1. 1872 UndefElts.clearBit(0); 1873 UndefElts |= UndefElts2[0]; 1874 break; 1875 } 1876 1877 // Three input scalar-as-vector operations that work column-wise. The high 1878 // elements come from operand 0 and the low element is a function of all 1879 // three inputs. 1880 case Intrinsic::x86_avx512_mask_add_ss_round: 1881 case Intrinsic::x86_avx512_mask_div_ss_round: 1882 case Intrinsic::x86_avx512_mask_mul_ss_round: 1883 case Intrinsic::x86_avx512_mask_sub_ss_round: 1884 case Intrinsic::x86_avx512_mask_max_ss_round: 1885 case Intrinsic::x86_avx512_mask_min_ss_round: 1886 case Intrinsic::x86_avx512_mask_add_sd_round: 1887 case Intrinsic::x86_avx512_mask_div_sd_round: 1888 case Intrinsic::x86_avx512_mask_mul_sd_round: 1889 case Intrinsic::x86_avx512_mask_sub_sd_round: 1890 case Intrinsic::x86_avx512_mask_max_sd_round: 1891 case Intrinsic::x86_avx512_mask_min_sd_round: 1892 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1893 1894 // If lowest element of a scalar op isn't used then use Arg0. 
1895 if (!DemandedElts[0]) { 1896 IC.addToWorklist(&II); 1897 return II.getArgOperand(0); 1898 } 1899 1900 // Only lower element is used for operand 1 and 2. 1901 DemandedElts = 1; 1902 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1903 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 1904 1905 // Lower element is undefined if all three lower elements are undefined. 1906 // Consider things like undef&0. The result is known zero, not undef. 1907 if (!UndefElts2[0] || !UndefElts3[0]) 1908 UndefElts.clearBit(0); 1909 break; 1910 1911 // TODO: Add fmaddsub support? 1912 case Intrinsic::x86_sse3_addsub_pd: 1913 case Intrinsic::x86_sse3_addsub_ps: 1914 case Intrinsic::x86_avx_addsub_pd_256: 1915 case Intrinsic::x86_avx_addsub_ps_256: { 1916 // If none of the even or none of the odd lanes are required, turn this 1917 // into a generic FP math instruction. 1918 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 1919 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 1920 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 1921 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 1922 if (IsSubOnly || IsAddOnly) { 1923 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 1924 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 1925 IC.Builder.SetInsertPoint(&II); 1926 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 1927 return IC.Builder.CreateBinOp( 1928 IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 1929 } 1930 1931 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1932 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1933 UndefElts &= UndefElts2; 1934 break; 1935 } 1936 1937 case Intrinsic::x86_sse2_packssdw_128: 1938 case Intrinsic::x86_sse2_packsswb_128: 1939 case Intrinsic::x86_sse2_packuswb_128: 1940 case Intrinsic::x86_sse41_packusdw: 1941 case Intrinsic::x86_avx2_packssdw: 1942 case Intrinsic::x86_avx2_packsswb: 1943 case Intrinsic::x86_avx2_packusdw: 1944 case Intrinsic::x86_avx2_packuswb: 1945 case Intrinsic::x86_avx512_packssdw_512: 1946 case Intrinsic::x86_avx512_packsswb_512: 1947 case Intrinsic::x86_avx512_packusdw_512: 1948 case Intrinsic::x86_avx512_packuswb_512: { 1949 auto *Ty0 = II.getArgOperand(0)->getType(); 1950 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 1951 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 1952 1953 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 1954 unsigned VWidthPerLane = VWidth / NumLanes; 1955 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 1956 1957 // Per lane, pack the elements of the first input and then the second. 1958 // e.g. 1959 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 1960 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 1961 for (int OpNum = 0; OpNum != 2; ++OpNum) { 1962 APInt OpDemandedElts(InnerVWidth, 0); 1963 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 1964 unsigned LaneIdx = Lane * VWidthPerLane; 1965 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 1966 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 1967 if (DemandedElts[Idx]) 1968 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 1969 } 1970 } 1971 1972 // Demand elements from the operand. 1973 APInt OpUndefElts(InnerVWidth, 0); 1974 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 1975 1976 // Pack the operand's UNDEF elements, one lane at a time. 
1977 OpUndefElts = OpUndefElts.zext(VWidth); 1978 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 1979 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 1980 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 1981 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 1982 UndefElts |= LaneElts; 1983 } 1984 } 1985 break; 1986 } 1987 1988 // PSHUFB 1989 case Intrinsic::x86_ssse3_pshuf_b_128: 1990 case Intrinsic::x86_avx2_pshuf_b: 1991 case Intrinsic::x86_avx512_pshuf_b_512: 1992 // PERMILVAR 1993 case Intrinsic::x86_avx_vpermilvar_ps: 1994 case Intrinsic::x86_avx_vpermilvar_ps_256: 1995 case Intrinsic::x86_avx512_vpermilvar_ps_512: 1996 case Intrinsic::x86_avx_vpermilvar_pd: 1997 case Intrinsic::x86_avx_vpermilvar_pd_256: 1998 case Intrinsic::x86_avx512_vpermilvar_pd_512: 1999 // PERMV 2000 case Intrinsic::x86_avx2_permd: 2001 case Intrinsic::x86_avx2_permps: { 2002 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 2003 break; 2004 } 2005 2006 // SSE4A instructions leave the upper 64-bits of the 128-bit result 2007 // in an undefined state. 2008 case Intrinsic::x86_sse4a_extrq: 2009 case Intrinsic::x86_sse4a_extrqi: 2010 case Intrinsic::x86_sse4a_insertq: 2011 case Intrinsic::x86_sse4a_insertqi: 2012 UndefElts.setHighBits(VWidth / 2); 2013 break; 2014 } 2015 return None; 2016 } 2017