//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
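    // Roughly, for one possible caller such as llvm.x86.avx.maskload.ps.256:
    //   %r = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %p, <8 x i32> %m)
    // becomes (with %bool being the constant-folded sign-bit compare or the
    // original bool vector the mask was sign-extended from):
    //   %r = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 1,
    //            <8 x i1> %bool, <8 x float> zeroinitializer)
    // Align(1) is used because the x86 maskload has no alignment requirement.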
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
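  // A sketch of the shift-by-scalar form handled below: for
  // llvm.x86.sse2.psrl.w only the low 64 bits of the <8 x i16> count operand
  // matter, so a constant count of <i16 3, i16 0, i16 0, i16 0, ...> behaves
  // like an lshr of every element by 3.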
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
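// For example, a psrlv.d element shift amount of 32 or more produces 0 in
// that element, and a psrav.d amount of 32 or more behaves like a shift by
// 31, whereas the equivalent generic IR shift amount would be poison.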
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
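  // e.g. a psrlv.d shift of <4 x i32> %x by <i32 33, i32 undef, i32 40, i32 64>
  // folds to <i32 0, i32 undef, i32 0, i32 0> here.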
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
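  // e.g. for a 256-bit packssdw (two <8 x i32> inputs -> <16 x i16>) the
  // result element order per 128-bit lane is a[0..3], b[0..3], a[4..7],
  // b[4..7], which is what the mask built below encodes.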
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
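    // The x86 addcarry returns { i8 carry-out, iN sum } while
    // llvm.uadd.with.overflow returns { iN sum, i1 overflow }, so the two
    // fields are swapped and the flag is widened to i8 below.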
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {

  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
  bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
  bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Used for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};

  // Currently we only handle cases that convert directly to another instruction
  // or cases where all the ops are constant. This is because we don't properly
  // handle creating ternary ops in the backend, so splitting them here may
  // cause regressions. As the backend improves, uncomment more cases.
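  // The immediate is the truth table of the ternary function evaluated
  // bitwise: with A = 0xf0, B = 0xcc and C = 0xaa as above, applying the
  // selected logic ops to those constants must reproduce Imm (checked by the
  // assert after the switch). e.g. Imm == 0x80 is the AND of all three
  // operands, since (0xf0 & 0xcc & 0xaa) == 0x80.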
611 612 uint8_t Imm = ArgImm->getValue().getZExtValue(); 613 switch (Imm) { 614 case 0x0: 615 Res = {Constant::getNullValue(Ty), 0}; 616 break; 617 case 0x1: 618 if (ABCIsConst) 619 Res = Nor(Or(A, B), C); 620 break; 621 case 0x2: 622 if (ABCIsConst) 623 Res = And(Nor(A, B), C); 624 break; 625 case 0x3: 626 if (ABIsConst) 627 Res = Nor(A, B); 628 break; 629 case 0x4: 630 if (ABCIsConst) 631 Res = And(Nor(A, C), B); 632 break; 633 case 0x5: 634 if (ACIsConst) 635 Res = Nor(A, C); 636 break; 637 case 0x6: 638 if (ABCIsConst) 639 Res = Nor(A, Xnor(B, C)); 640 break; 641 case 0x7: 642 if (ABCIsConst) 643 Res = Nor(A, And(B, C)); 644 break; 645 case 0x8: 646 if (ABCIsConst) 647 Res = Nor(A, Nand(B, C)); 648 break; 649 case 0x9: 650 if (ABCIsConst) 651 Res = Nor(A, Xor(B, C)); 652 break; 653 case 0xa: 654 if (ACIsConst) 655 Res = Nor(A, Not(C)); 656 break; 657 case 0xb: 658 if (ABCIsConst) 659 Res = Nor(A, Nor(C, Not(B))); 660 break; 661 case 0xc: 662 if (ABIsConst) 663 Res = Nor(A, Not(B)); 664 break; 665 case 0xd: 666 if (ABCIsConst) 667 Res = Nor(A, Nor(B, Not(C))); 668 break; 669 case 0xe: 670 if (ABCIsConst) 671 Res = Nor(A, Nor(B, C)); 672 break; 673 case 0xf: 674 Res = Not(A); 675 break; 676 case 0x10: 677 if (ABCIsConst) 678 Res = And(A, Nor(B, C)); 679 break; 680 case 0x11: 681 if (BCIsConst) 682 Res = Nor(B, C); 683 break; 684 case 0x12: 685 if (ABCIsConst) 686 Res = Nor(Xnor(A, C), B); 687 break; 688 case 0x13: 689 if (ABCIsConst) 690 Res = Nor(And(A, C), B); 691 break; 692 case 0x14: 693 if (ABCIsConst) 694 Res = Nor(Xnor(A, B), C); 695 break; 696 case 0x15: 697 if (ABCIsConst) 698 Res = Nor(And(A, B), C); 699 break; 700 case 0x16: 701 if (ABCIsConst) 702 Res = Xor(Xor(A, B), And(Nand(A, B), C)); 703 break; 704 case 0x17: 705 if (ABCIsConst) 706 Res = Xor(Or(A, B), Or(Xnor(A, B), C)); 707 break; 708 case 0x18: 709 if (ABCIsConst) 710 Res = Nor(Xnor(A, B), Xnor(A, C)); 711 break; 712 case 0x19: 713 if (ABCIsConst) 714 Res = And(Nand(A, B), Xnor(B, C)); 715 break; 716 case 0x1a: 717 if (ABCIsConst) 718 Res = Xor(A, Or(And(A, B), C)); 719 break; 720 case 0x1b: 721 if (ABCIsConst) 722 Res = Xor(A, Or(Xnor(A, B), C)); 723 break; 724 case 0x1c: 725 if (ABCIsConst) 726 Res = Xor(A, Or(And(A, C), B)); 727 break; 728 case 0x1d: 729 if (ABCIsConst) 730 Res = Xor(A, Or(Xnor(A, C), B)); 731 break; 732 case 0x1e: 733 if (ABCIsConst) 734 Res = Xor(A, Or(B, C)); 735 break; 736 case 0x1f: 737 if (ABCIsConst) 738 Res = Nand(A, Or(B, C)); 739 break; 740 case 0x20: 741 if (ABCIsConst) 742 Res = Nor(Nand(A, C), B); 743 break; 744 case 0x21: 745 if (ABCIsConst) 746 Res = Nor(Xor(A, C), B); 747 break; 748 case 0x22: 749 if (BCIsConst) 750 Res = Nor(B, Not(C)); 751 break; 752 case 0x23: 753 if (ABCIsConst) 754 Res = Nor(B, Nor(C, Not(A))); 755 break; 756 case 0x24: 757 if (ABCIsConst) 758 Res = Nor(Xnor(A, B), Xor(A, C)); 759 break; 760 case 0x25: 761 if (ABCIsConst) 762 Res = Xor(A, Nand(Nand(A, B), C)); 763 break; 764 case 0x26: 765 if (ABCIsConst) 766 Res = And(Nand(A, B), Xor(B, C)); 767 break; 768 case 0x27: 769 if (ABCIsConst) 770 Res = Xor(Or(Xnor(A, B), C), B); 771 break; 772 case 0x28: 773 if (ABCIsConst) 774 Res = And(Xor(A, B), C); 775 break; 776 case 0x29: 777 if (ABCIsConst) 778 Res = Xor(Xor(A, B), Nor(And(A, B), C)); 779 break; 780 case 0x2a: 781 if (ABCIsConst) 782 Res = And(Nand(A, B), C); 783 break; 784 case 0x2b: 785 if (ABCIsConst) 786 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A); 787 break; 788 case 0x2c: 789 if (ABCIsConst) 790 Res = Nor(Xnor(A, B), Nor(B, C)); 791 break; 792 case 0x2d: 
793 if (ABCIsConst) 794 Res = Xor(A, Or(B, Not(C))); 795 break; 796 case 0x2e: 797 if (ABCIsConst) 798 Res = Xor(A, Or(Xor(A, C), B)); 799 break; 800 case 0x2f: 801 if (ABCIsConst) 802 Res = Nand(A, Or(B, Not(C))); 803 break; 804 case 0x30: 805 if (ABIsConst) 806 Res = Nor(B, Not(A)); 807 break; 808 case 0x31: 809 if (ABCIsConst) 810 Res = Nor(Nor(A, Not(C)), B); 811 break; 812 case 0x32: 813 if (ABCIsConst) 814 Res = Nor(Nor(A, C), B); 815 break; 816 case 0x33: 817 Res = Not(B); 818 break; 819 case 0x34: 820 if (ABCIsConst) 821 Res = And(Xor(A, B), Nand(B, C)); 822 break; 823 case 0x35: 824 if (ABCIsConst) 825 Res = Xor(B, Or(A, Xnor(B, C))); 826 break; 827 case 0x36: 828 if (ABCIsConst) 829 Res = Xor(Or(A, C), B); 830 break; 831 case 0x37: 832 if (ABCIsConst) 833 Res = Nand(Or(A, C), B); 834 break; 835 case 0x38: 836 if (ABCIsConst) 837 Res = Nor(Xnor(A, B), Nor(A, C)); 838 break; 839 case 0x39: 840 if (ABCIsConst) 841 Res = Xor(Or(A, Not(C)), B); 842 break; 843 case 0x3a: 844 if (ABCIsConst) 845 Res = Xor(B, Or(A, Xor(B, C))); 846 break; 847 case 0x3b: 848 if (ABCIsConst) 849 Res = Nand(Or(A, Not(C)), B); 850 break; 851 case 0x3c: 852 Res = Xor(A, B); 853 break; 854 case 0x3d: 855 if (ABCIsConst) 856 Res = Xor(A, Or(Nor(A, C), B)); 857 break; 858 case 0x3e: 859 if (ABCIsConst) 860 Res = Xor(A, Or(Nor(A, Not(C)), B)); 861 break; 862 case 0x3f: 863 if (ABIsConst) 864 Res = Nand(A, B); 865 break; 866 case 0x40: 867 if (ABCIsConst) 868 Res = Nor(Nand(A, B), C); 869 break; 870 case 0x41: 871 if (ABCIsConst) 872 Res = Nor(Xor(A, B), C); 873 break; 874 case 0x42: 875 if (ABCIsConst) 876 Res = Nor(Xor(A, B), Xnor(A, C)); 877 break; 878 case 0x43: 879 if (ABCIsConst) 880 Res = Xor(A, Nand(Nand(A, C), B)); 881 break; 882 case 0x44: 883 if (BCIsConst) 884 Res = Nor(C, Not(B)); 885 break; 886 case 0x45: 887 if (ABCIsConst) 888 Res = Nor(Nor(B, Not(A)), C); 889 break; 890 case 0x46: 891 if (ABCIsConst) 892 Res = Xor(Or(And(A, C), B), C); 893 break; 894 case 0x47: 895 if (ABCIsConst) 896 Res = Xor(Or(Xnor(A, C), B), C); 897 break; 898 case 0x48: 899 if (ABCIsConst) 900 Res = And(Xor(A, C), B); 901 break; 902 case 0x49: 903 if (ABCIsConst) 904 Res = Xor(Or(Xnor(A, B), And(A, C)), C); 905 break; 906 case 0x4a: 907 if (ABCIsConst) 908 Res = Nor(Xnor(A, C), Nor(B, C)); 909 break; 910 case 0x4b: 911 if (ABCIsConst) 912 Res = Xor(A, Or(C, Not(B))); 913 break; 914 case 0x4c: 915 if (ABCIsConst) 916 Res = And(Nand(A, C), B); 917 break; 918 case 0x4d: 919 if (ABCIsConst) 920 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A); 921 break; 922 case 0x4e: 923 if (ABCIsConst) 924 Res = Xor(A, Or(Xor(A, B), C)); 925 break; 926 case 0x4f: 927 if (ABCIsConst) 928 Res = Nand(A, Nand(B, Not(C))); 929 break; 930 case 0x50: 931 if (ACIsConst) 932 Res = Nor(C, Not(A)); 933 break; 934 case 0x51: 935 if (ABCIsConst) 936 Res = Nor(Nor(A, Not(B)), C); 937 break; 938 case 0x52: 939 if (ABCIsConst) 940 Res = And(Xor(A, C), Nand(B, C)); 941 break; 942 case 0x53: 943 if (ABCIsConst) 944 Res = Xor(Or(Xnor(B, C), A), C); 945 break; 946 case 0x54: 947 if (ABCIsConst) 948 Res = Nor(Nor(A, B), C); 949 break; 950 case 0x55: 951 Res = Not(C); 952 break; 953 case 0x56: 954 if (ABCIsConst) 955 Res = Xor(Or(A, B), C); 956 break; 957 case 0x57: 958 if (ABCIsConst) 959 Res = Nand(Or(A, B), C); 960 break; 961 case 0x58: 962 if (ABCIsConst) 963 Res = Nor(Nor(A, B), Xnor(A, C)); 964 break; 965 case 0x59: 966 if (ABCIsConst) 967 Res = Xor(Or(A, Not(B)), C); 968 break; 969 case 0x5a: 970 Res = Xor(A, C); 971 break; 972 case 0x5b: 973 if (ABCIsConst) 974 Res 
= Xor(A, Or(Nor(A, B), C)); 975 break; 976 case 0x5c: 977 if (ABCIsConst) 978 Res = Xor(Or(Xor(B, C), A), C); 979 break; 980 case 0x5d: 981 if (ABCIsConst) 982 Res = Nand(Or(A, Not(B)), C); 983 break; 984 case 0x5e: 985 if (ABCIsConst) 986 Res = Xor(A, Or(Nor(A, Not(B)), C)); 987 break; 988 case 0x5f: 989 if (ACIsConst) 990 Res = Nand(A, C); 991 break; 992 case 0x60: 993 if (ABCIsConst) 994 Res = And(A, Xor(B, C)); 995 break; 996 case 0x61: 997 if (ABCIsConst) 998 Res = Xor(Or(Xnor(A, B), And(B, C)), C); 999 break; 1000 case 0x62: 1001 if (ABCIsConst) 1002 Res = Nor(Nor(A, C), Xnor(B, C)); 1003 break; 1004 case 0x63: 1005 if (ABCIsConst) 1006 Res = Xor(B, Or(C, Not(A))); 1007 break; 1008 case 0x64: 1009 if (ABCIsConst) 1010 Res = Nor(Nor(A, B), Xnor(B, C)); 1011 break; 1012 case 0x65: 1013 if (ABCIsConst) 1014 Res = Xor(Or(B, Not(A)), C); 1015 break; 1016 case 0x66: 1017 Res = Xor(B, C); 1018 break; 1019 case 0x67: 1020 if (ABCIsConst) 1021 Res = Or(Nor(A, B), Xor(B, C)); 1022 break; 1023 case 0x68: 1024 if (ABCIsConst) 1025 Res = Xor(Xor(A, B), Nor(Nor(A, B), C)); 1026 break; 1027 case 0x69: 1028 if (ABCIsConst) 1029 Res = Xor(Xnor(A, B), C); 1030 break; 1031 case 0x6a: 1032 if (ABCIsConst) 1033 Res = Xor(And(A, B), C); 1034 break; 1035 case 0x6b: 1036 if (ABCIsConst) 1037 Res = Or(Nor(A, B), Xor(Xnor(A, B), C)); 1038 break; 1039 case 0x6c: 1040 if (ABCIsConst) 1041 Res = Xor(And(A, C), B); 1042 break; 1043 case 0x6d: 1044 if (ABCIsConst) 1045 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C); 1046 break; 1047 case 0x6e: 1048 if (ABCIsConst) 1049 Res = Or(Nor(A, Not(B)), Xor(B, C)); 1050 break; 1051 case 0x6f: 1052 if (ABCIsConst) 1053 Res = Nand(A, Xnor(B, C)); 1054 break; 1055 case 0x70: 1056 if (ABCIsConst) 1057 Res = And(A, Nand(B, C)); 1058 break; 1059 case 0x71: 1060 if (ABCIsConst) 1061 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A); 1062 break; 1063 case 0x72: 1064 if (ABCIsConst) 1065 Res = Xor(Or(Xor(A, B), C), B); 1066 break; 1067 case 0x73: 1068 if (ABCIsConst) 1069 Res = Nand(Nand(A, Not(C)), B); 1070 break; 1071 case 0x74: 1072 if (ABCIsConst) 1073 Res = Xor(Or(Xor(A, C), B), C); 1074 break; 1075 case 0x75: 1076 if (ABCIsConst) 1077 Res = Nand(Nand(A, Not(B)), C); 1078 break; 1079 case 0x76: 1080 if (ABCIsConst) 1081 Res = Xor(B, Or(Nor(B, Not(A)), C)); 1082 break; 1083 case 0x77: 1084 if (BCIsConst) 1085 Res = Nand(B, C); 1086 break; 1087 case 0x78: 1088 if (ABCIsConst) 1089 Res = Xor(A, And(B, C)); 1090 break; 1091 case 0x79: 1092 if (ABCIsConst) 1093 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C); 1094 break; 1095 case 0x7a: 1096 if (ABCIsConst) 1097 Res = Or(Xor(A, C), Nor(B, Not(A))); 1098 break; 1099 case 0x7b: 1100 if (ABCIsConst) 1101 Res = Nand(Xnor(A, C), B); 1102 break; 1103 case 0x7c: 1104 if (ABCIsConst) 1105 Res = Or(Xor(A, B), Nor(C, Not(A))); 1106 break; 1107 case 0x7d: 1108 if (ABCIsConst) 1109 Res = Nand(Xnor(A, B), C); 1110 break; 1111 case 0x7e: 1112 if (ABCIsConst) 1113 Res = Or(Xor(A, B), Xor(A, C)); 1114 break; 1115 case 0x7f: 1116 if (ABCIsConst) 1117 Res = Nand(And(A, B), C); 1118 break; 1119 case 0x80: 1120 if (ABCIsConst) 1121 Res = And(And(A, B), C); 1122 break; 1123 case 0x81: 1124 if (ABCIsConst) 1125 Res = Nor(Xor(A, B), Xor(A, C)); 1126 break; 1127 case 0x82: 1128 if (ABCIsConst) 1129 Res = And(Xnor(A, B), C); 1130 break; 1131 case 0x83: 1132 if (ABCIsConst) 1133 Res = Nor(Xor(A, B), Nor(C, Not(A))); 1134 break; 1135 case 0x84: 1136 if (ABCIsConst) 1137 Res = And(Xnor(A, C), B); 1138 break; 1139 case 0x85: 1140 if (ABCIsConst) 1141 Res = Nor(Xor(A, C), Nor(B, 
Not(A))); 1142 break; 1143 case 0x86: 1144 if (ABCIsConst) 1145 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C); 1146 break; 1147 case 0x87: 1148 if (ABCIsConst) 1149 Res = Xor(A, Nand(B, C)); 1150 break; 1151 case 0x88: 1152 Res = And(B, C); 1153 break; 1154 case 0x89: 1155 if (ABCIsConst) 1156 Res = Xor(B, Nor(Nor(B, Not(A)), C)); 1157 break; 1158 case 0x8a: 1159 if (ABCIsConst) 1160 Res = And(Nand(A, Not(B)), C); 1161 break; 1162 case 0x8b: 1163 if (ABCIsConst) 1164 Res = Xor(Nor(Xor(A, C), B), C); 1165 break; 1166 case 0x8c: 1167 if (ABCIsConst) 1168 Res = And(Nand(A, Not(C)), B); 1169 break; 1170 case 0x8d: 1171 if (ABCIsConst) 1172 Res = Xor(Nor(Xor(A, B), C), B); 1173 break; 1174 case 0x8e: 1175 if (ABCIsConst) 1176 Res = Xor(Or(Xor(A, B), Xor(A, C)), A); 1177 break; 1178 case 0x8f: 1179 if (ABCIsConst) 1180 Res = Nand(A, Nand(B, C)); 1181 break; 1182 case 0x90: 1183 if (ABCIsConst) 1184 Res = And(A, Xnor(B, C)); 1185 break; 1186 case 0x91: 1187 if (ABCIsConst) 1188 Res = Nor(Nor(A, Not(B)), Xor(B, C)); 1189 break; 1190 case 0x92: 1191 if (ABCIsConst) 1192 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C); 1193 break; 1194 case 0x93: 1195 if (ABCIsConst) 1196 Res = Xor(Nand(A, C), B); 1197 break; 1198 case 0x94: 1199 if (ABCIsConst) 1200 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C)); 1201 break; 1202 case 0x95: 1203 if (ABCIsConst) 1204 Res = Xor(Nand(A, B), C); 1205 break; 1206 case 0x96: 1207 if (ABCIsConst) 1208 Res = Xor(Xor(A, B), C); 1209 break; 1210 case 0x97: 1211 if (ABCIsConst) 1212 Res = Xor(Xor(A, B), Or(Nor(A, B), C)); 1213 break; 1214 case 0x98: 1215 if (ABCIsConst) 1216 Res = Nor(Nor(A, B), Xor(B, C)); 1217 break; 1218 case 0x99: 1219 if (BCIsConst) 1220 Res = Xnor(B, C); 1221 break; 1222 case 0x9a: 1223 if (ABCIsConst) 1224 Res = Xor(Nor(B, Not(A)), C); 1225 break; 1226 case 0x9b: 1227 if (ABCIsConst) 1228 Res = Or(Nor(A, B), Xnor(B, C)); 1229 break; 1230 case 0x9c: 1231 if (ABCIsConst) 1232 Res = Xor(B, Nor(C, Not(A))); 1233 break; 1234 case 0x9d: 1235 if (ABCIsConst) 1236 Res = Or(Nor(A, C), Xnor(B, C)); 1237 break; 1238 case 0x9e: 1239 if (ABCIsConst) 1240 Res = Xor(And(Xor(A, B), Nand(B, C)), C); 1241 break; 1242 case 0x9f: 1243 if (ABCIsConst) 1244 Res = Nand(A, Xor(B, C)); 1245 break; 1246 case 0xa0: 1247 Res = And(A, C); 1248 break; 1249 case 0xa1: 1250 if (ABCIsConst) 1251 Res = Xor(A, Nor(Nor(A, Not(B)), C)); 1252 break; 1253 case 0xa2: 1254 if (ABCIsConst) 1255 Res = And(Or(A, Not(B)), C); 1256 break; 1257 case 0xa3: 1258 if (ABCIsConst) 1259 Res = Xor(Nor(Xor(B, C), A), C); 1260 break; 1261 case 0xa4: 1262 if (ABCIsConst) 1263 Res = Xor(A, Nor(Nor(A, B), C)); 1264 break; 1265 case 0xa5: 1266 if (ACIsConst) 1267 Res = Xnor(A, C); 1268 break; 1269 case 0xa6: 1270 if (ABCIsConst) 1271 Res = Xor(Nor(A, Not(B)), C); 1272 break; 1273 case 0xa7: 1274 if (ABCIsConst) 1275 Res = Or(Nor(A, B), Xnor(A, C)); 1276 break; 1277 case 0xa8: 1278 if (ABCIsConst) 1279 Res = And(Or(A, B), C); 1280 break; 1281 case 0xa9: 1282 if (ABCIsConst) 1283 Res = Xor(Nor(A, B), C); 1284 break; 1285 case 0xaa: 1286 Res = C; 1287 break; 1288 case 0xab: 1289 if (ABCIsConst) 1290 Res = Or(Nor(A, B), C); 1291 break; 1292 case 0xac: 1293 if (ABCIsConst) 1294 Res = Xor(Nor(Xnor(B, C), A), C); 1295 break; 1296 case 0xad: 1297 if (ABCIsConst) 1298 Res = Or(Xnor(A, C), And(B, C)); 1299 break; 1300 case 0xae: 1301 if (ABCIsConst) 1302 Res = Or(Nor(A, Not(B)), C); 1303 break; 1304 case 0xaf: 1305 if (ACIsConst) 1306 Res = Or(C, Not(A)); 1307 break; 1308 case 0xb0: 1309 if (ABCIsConst) 1310 Res = And(A, Nand(B, Not(C))); 
1311 break; 1312 case 0xb1: 1313 if (ABCIsConst) 1314 Res = Xor(A, Nor(Xor(A, B), C)); 1315 break; 1316 case 0xb2: 1317 if (ABCIsConst) 1318 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A); 1319 break; 1320 case 0xb3: 1321 if (ABCIsConst) 1322 Res = Nand(Nand(A, C), B); 1323 break; 1324 case 0xb4: 1325 if (ABCIsConst) 1326 Res = Xor(A, Nor(C, Not(B))); 1327 break; 1328 case 0xb5: 1329 if (ABCIsConst) 1330 Res = Or(Xnor(A, C), Nor(B, C)); 1331 break; 1332 case 0xb6: 1333 if (ABCIsConst) 1334 Res = Xor(And(Xor(A, B), Nand(A, C)), C); 1335 break; 1336 case 0xb7: 1337 if (ABCIsConst) 1338 Res = Nand(Xor(A, C), B); 1339 break; 1340 case 0xb8: 1341 if (ABCIsConst) 1342 Res = Xor(Nor(Xnor(A, C), B), C); 1343 break; 1344 case 0xb9: 1345 if (ABCIsConst) 1346 Res = Xor(Nor(And(A, C), B), C); 1347 break; 1348 case 0xba: 1349 if (ABCIsConst) 1350 Res = Or(Nor(B, Not(A)), C); 1351 break; 1352 case 0xbb: 1353 if (BCIsConst) 1354 Res = Or(C, Not(B)); 1355 break; 1356 case 0xbc: 1357 if (ABCIsConst) 1358 Res = Xor(A, And(Nand(A, C), B)); 1359 break; 1360 case 0xbd: 1361 if (ABCIsConst) 1362 Res = Or(Xor(A, B), Xnor(A, C)); 1363 break; 1364 case 0xbe: 1365 if (ABCIsConst) 1366 Res = Or(Xor(A, B), C); 1367 break; 1368 case 0xbf: 1369 if (ABCIsConst) 1370 Res = Or(Nand(A, B), C); 1371 break; 1372 case 0xc0: 1373 Res = And(A, B); 1374 break; 1375 case 0xc1: 1376 if (ABCIsConst) 1377 Res = Xor(A, Nor(Nor(A, Not(C)), B)); 1378 break; 1379 case 0xc2: 1380 if (ABCIsConst) 1381 Res = Xor(A, Nor(Nor(A, C), B)); 1382 break; 1383 case 0xc3: 1384 if (ABIsConst) 1385 Res = Xnor(A, B); 1386 break; 1387 case 0xc4: 1388 if (ABCIsConst) 1389 Res = And(Or(A, Not(C)), B); 1390 break; 1391 case 0xc5: 1392 if (ABCIsConst) 1393 Res = Xor(B, Nor(A, Xor(B, C))); 1394 break; 1395 case 0xc6: 1396 if (ABCIsConst) 1397 Res = Xor(Nor(A, Not(C)), B); 1398 break; 1399 case 0xc7: 1400 if (ABCIsConst) 1401 Res = Or(Xnor(A, B), Nor(A, C)); 1402 break; 1403 case 0xc8: 1404 if (ABCIsConst) 1405 Res = And(Or(A, C), B); 1406 break; 1407 case 0xc9: 1408 if (ABCIsConst) 1409 Res = Xor(Nor(A, C), B); 1410 break; 1411 case 0xca: 1412 if (ABCIsConst) 1413 Res = Xor(B, Nor(A, Xnor(B, C))); 1414 break; 1415 case 0xcb: 1416 if (ABCIsConst) 1417 Res = Or(Xnor(A, B), And(B, C)); 1418 break; 1419 case 0xcc: 1420 Res = B; 1421 break; 1422 case 0xcd: 1423 if (ABCIsConst) 1424 Res = Or(Nor(A, C), B); 1425 break; 1426 case 0xce: 1427 if (ABCIsConst) 1428 Res = Or(Nor(A, Not(C)), B); 1429 break; 1430 case 0xcf: 1431 if (ABIsConst) 1432 Res = Or(B, Not(A)); 1433 break; 1434 case 0xd0: 1435 if (ABCIsConst) 1436 Res = And(A, Or(B, Not(C))); 1437 break; 1438 case 0xd1: 1439 if (ABCIsConst) 1440 Res = Xor(A, Nor(Xor(A, C), B)); 1441 break; 1442 case 0xd2: 1443 if (ABCIsConst) 1444 Res = Xor(A, Nor(B, Not(C))); 1445 break; 1446 case 0xd3: 1447 if (ABCIsConst) 1448 Res = Or(Xnor(A, B), Nor(B, C)); 1449 break; 1450 case 0xd4: 1451 if (ABCIsConst) 1452 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A); 1453 break; 1454 case 0xd5: 1455 if (ABCIsConst) 1456 Res = Nand(Nand(A, B), C); 1457 break; 1458 case 0xd6: 1459 if (ABCIsConst) 1460 Res = Xor(Xor(A, B), Or(And(A, B), C)); 1461 break; 1462 case 0xd7: 1463 if (ABCIsConst) 1464 Res = Nand(Xor(A, B), C); 1465 break; 1466 case 0xd8: 1467 if (ABCIsConst) 1468 Res = Xor(Nor(Xnor(A, B), C), B); 1469 break; 1470 case 0xd9: 1471 if (ABCIsConst) 1472 Res = Or(And(A, B), Xnor(B, C)); 1473 break; 1474 case 0xda: 1475 if (ABCIsConst) 1476 Res = Xor(A, And(Nand(A, B), C)); 1477 break; 1478 case 0xdb: 1479 if (ABCIsConst) 1480 Res = Or(Xnor(A, B), 
Xor(A, C)); 1481 break; 1482 case 0xdc: 1483 if (ABCIsConst) 1484 Res = Or(B, Nor(C, Not(A))); 1485 break; 1486 case 0xdd: 1487 if (BCIsConst) 1488 Res = Or(B, Not(C)); 1489 break; 1490 case 0xde: 1491 if (ABCIsConst) 1492 Res = Or(Xor(A, C), B); 1493 break; 1494 case 0xdf: 1495 if (ABCIsConst) 1496 Res = Or(Nand(A, C), B); 1497 break; 1498 case 0xe0: 1499 if (ABCIsConst) 1500 Res = And(A, Or(B, C)); 1501 break; 1502 case 0xe1: 1503 if (ABCIsConst) 1504 Res = Xor(A, Nor(B, C)); 1505 break; 1506 case 0xe2: 1507 if (ABCIsConst) 1508 Res = Xor(A, Nor(Xnor(A, C), B)); 1509 break; 1510 case 0xe3: 1511 if (ABCIsConst) 1512 Res = Xor(A, Nor(And(A, C), B)); 1513 break; 1514 case 0xe4: 1515 if (ABCIsConst) 1516 Res = Xor(A, Nor(Xnor(A, B), C)); 1517 break; 1518 case 0xe5: 1519 if (ABCIsConst) 1520 Res = Xor(A, Nor(And(A, B), C)); 1521 break; 1522 case 0xe6: 1523 if (ABCIsConst) 1524 Res = Or(And(A, B), Xor(B, C)); 1525 break; 1526 case 0xe7: 1527 if (ABCIsConst) 1528 Res = Or(Xnor(A, B), Xnor(A, C)); 1529 break; 1530 case 0xe8: 1531 if (ABCIsConst) 1532 Res = Xor(Or(A, B), Nor(Xnor(A, B), C)); 1533 break; 1534 case 0xe9: 1535 if (ABCIsConst) 1536 Res = Xor(Xor(A, B), Nand(Nand(A, B), C)); 1537 break; 1538 case 0xea: 1539 if (ABCIsConst) 1540 Res = Or(And(A, B), C); 1541 break; 1542 case 0xeb: 1543 if (ABCIsConst) 1544 Res = Or(Xnor(A, B), C); 1545 break; 1546 case 0xec: 1547 if (ABCIsConst) 1548 Res = Or(And(A, C), B); 1549 break; 1550 case 0xed: 1551 if (ABCIsConst) 1552 Res = Or(Xnor(A, C), B); 1553 break; 1554 case 0xee: 1555 Res = Or(B, C); 1556 break; 1557 case 0xef: 1558 if (ABCIsConst) 1559 Res = Nand(A, Nor(B, C)); 1560 break; 1561 case 0xf0: 1562 Res = A; 1563 break; 1564 case 0xf1: 1565 if (ABCIsConst) 1566 Res = Or(A, Nor(B, C)); 1567 break; 1568 case 0xf2: 1569 if (ABCIsConst) 1570 Res = Or(A, Nor(B, Not(C))); 1571 break; 1572 case 0xf3: 1573 if (ABIsConst) 1574 Res = Or(A, Not(B)); 1575 break; 1576 case 0xf4: 1577 if (ABCIsConst) 1578 Res = Or(A, Nor(C, Not(B))); 1579 break; 1580 case 0xf5: 1581 if (ACIsConst) 1582 Res = Or(A, Not(C)); 1583 break; 1584 case 0xf6: 1585 if (ABCIsConst) 1586 Res = Or(A, Xor(B, C)); 1587 break; 1588 case 0xf7: 1589 if (ABCIsConst) 1590 Res = Or(A, Nand(B, C)); 1591 break; 1592 case 0xf8: 1593 if (ABCIsConst) 1594 Res = Or(A, And(B, C)); 1595 break; 1596 case 0xf9: 1597 if (ABCIsConst) 1598 Res = Or(A, Xnor(B, C)); 1599 break; 1600 case 0xfa: 1601 Res = Or(A, C); 1602 break; 1603 case 0xfb: 1604 if (ABCIsConst) 1605 Res = Nand(Nor(A, C), B); 1606 break; 1607 case 0xfc: 1608 Res = Or(A, B); 1609 break; 1610 case 0xfd: 1611 if (ABCIsConst) 1612 Res = Nand(Nor(A, B), C); 1613 break; 1614 case 0xfe: 1615 if (ABCIsConst) 1616 Res = Or(Or(A, B), C); 1617 break; 1618 case 0xff: 1619 Res = {Constant::getAllOnesValue(Ty), 0xff}; 1620 break; 1621 } 1622 1623 assert((Res.first == nullptr || Res.second == Imm) && 1624 "Simplification of ternary logic does not verify!"); 1625 return Res.first; 1626 } 1627 1628 static Value *simplifyX86insertps(const IntrinsicInst &II, 1629 InstCombiner::BuilderTy &Builder) { 1630 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1631 if (!CInt) 1632 return nullptr; 1633 1634 auto *VecTy = cast<FixedVectorType>(II.getType()); 1635 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 1636 1637 // The immediate permute control byte looks like this: 1638 // [3:0] - zero mask for each 32-bit lane 1639 // [5:4] - select one 32-bit destination lane 1640 // [7:6] - select one 32-bit source lane 1641 1642 uint8_t Imm 
      = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length, other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
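    // e.g. extracting Length = 16 bits at Index = 8 becomes a byte shuffle
    // that places source bytes 1-2 in the low result bytes, zeroes result
    // bytes 2-7, and leaves the upper 8 bytes undef.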
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length, other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.
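    // e.g. a control byte of 0x83 has bit[7] set, so (Index < 0) below maps
    // it into the second (all-zero) shuffle operand, producing a zero result
    // byte.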

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
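  // Each mask element is interpreted modulo the number of elements, e.g. an
  // index of 9 on a <8 x i32> vpermd selects element 1 (see the masking with
  // Size - 1 below).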
1975 int Indexes[64]; 1976 1977 for (unsigned I = 0; I < Size; ++I) { 1978 Constant *COp = V->getAggregateElement(I); 1979 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1980 return nullptr; 1981 1982 if (isa<UndefValue>(COp)) { 1983 Indexes[I] = -1; 1984 continue; 1985 } 1986 1987 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 1988 Index &= Size - 1; 1989 Indexes[I] = Index; 1990 } 1991 1992 auto V1 = II.getArgOperand(0); 1993 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size)); 1994 } 1995 1996 std::optional<Instruction *> 1997 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 1998 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 1999 unsigned DemandedWidth) { 2000 APInt UndefElts(Width, 0); 2001 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 2002 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 2003 }; 2004 2005 Intrinsic::ID IID = II.getIntrinsicID(); 2006 switch (IID) { 2007 case Intrinsic::x86_bmi_bextr_32: 2008 case Intrinsic::x86_bmi_bextr_64: 2009 case Intrinsic::x86_tbm_bextri_u32: 2010 case Intrinsic::x86_tbm_bextri_u64: 2011 // If the RHS is a constant we can try some simplifications. 2012 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2013 uint64_t Shift = C->getZExtValue(); 2014 uint64_t Length = (Shift >> 8) & 0xff; 2015 Shift &= 0xff; 2016 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2017 // If the length is 0 or the shift is out of range, replace with zero. 2018 if (Length == 0 || Shift >= BitWidth) { 2019 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2020 } 2021 // If the LHS is also a constant, we can completely constant fold this. 2022 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2023 uint64_t Result = InC->getZExtValue() >> Shift; 2024 if (Length > BitWidth) 2025 Length = BitWidth; 2026 Result &= maskTrailingOnes<uint64_t>(Length); 2027 return IC.replaceInstUsesWith(II, 2028 ConstantInt::get(II.getType(), Result)); 2029 } 2030 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2031 // are only masking bits that a shift already cleared? 2032 } 2033 break; 2034 2035 case Intrinsic::x86_bmi_bzhi_32: 2036 case Intrinsic::x86_bmi_bzhi_64: 2037 // If the RHS is a constant we can try some simplifications. 2038 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2039 uint64_t Index = C->getZExtValue() & 0xff; 2040 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2041 if (Index >= BitWidth) { 2042 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 2043 } 2044 if (Index == 0) { 2045 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2046 } 2047 // If the LHS is also a constant, we can completely constant fold this. 2048 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2049 uint64_t Result = InC->getZExtValue(); 2050 Result &= maskTrailingOnes<uint64_t>(Index); 2051 return IC.replaceInstUsesWith(II, 2052 ConstantInt::get(II.getType(), Result)); 2053 } 2054 // TODO should we convert this to an AND if the RHS is constant? 
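      // E.g. with a constant in-range index, bzhi(x, 8) is simply x & 0xff.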
2055 } 2056 break; 2057 case Intrinsic::x86_bmi_pext_32: 2058 case Intrinsic::x86_bmi_pext_64: 2059 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2060 if (MaskC->isNullValue()) { 2061 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2062 } 2063 if (MaskC->isAllOnesValue()) { 2064 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 2065 } 2066 2067 unsigned MaskIdx, MaskLen; 2068 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { 2069 // Any single contiguous sequence of 1s anywhere in the mask simply 2070 // describes a subset of the input bits shifted to the appropriate 2071 // position. Replace with the straightforward IR. 2072 Value *Input = II.getArgOperand(0); 2073 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); 2074 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); 2075 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt); 2076 return IC.replaceInstUsesWith(II, Shifted); 2077 } 2078 2079 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2080 uint64_t Src = SrcC->getZExtValue(); 2081 uint64_t Mask = MaskC->getZExtValue(); 2082 uint64_t Result = 0; 2083 uint64_t BitToSet = 1; 2084 2085 while (Mask) { 2086 // Isolate lowest set bit. 2087 uint64_t BitToTest = Mask & -Mask; 2088 if (BitToTest & Src) 2089 Result |= BitToSet; 2090 2091 BitToSet <<= 1; 2092 // Clear lowest set bit. 2093 Mask &= Mask - 1; 2094 } 2095 2096 return IC.replaceInstUsesWith(II, 2097 ConstantInt::get(II.getType(), Result)); 2098 } 2099 } 2100 break; 2101 case Intrinsic::x86_bmi_pdep_32: 2102 case Intrinsic::x86_bmi_pdep_64: 2103 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2104 if (MaskC->isNullValue()) { 2105 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2106 } 2107 if (MaskC->isAllOnesValue()) { 2108 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 2109 } 2110 2111 unsigned MaskIdx, MaskLen; 2112 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { 2113 // Any single contiguous sequence of 1s anywhere in the mask simply 2114 // describes a subset of the input bits shifted to the appropriate 2115 // position. Replace with the straightforward IR. 2116 Value *Input = II.getArgOperand(0); 2117 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); 2118 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); 2119 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); 2120 return IC.replaceInstUsesWith(II, Masked); 2121 } 2122 2123 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2124 uint64_t Src = SrcC->getZExtValue(); 2125 uint64_t Mask = MaskC->getZExtValue(); 2126 uint64_t Result = 0; 2127 uint64_t BitToTest = 1; 2128 2129 while (Mask) { 2130 // Isolate lowest set bit.
2131 uint64_t BitToSet = Mask & -Mask; 2132 if (BitToTest & Src) 2133 Result |= BitToSet; 2134 2135 BitToTest <<= 1; 2136 // Clear lowest set bit; 2137 Mask &= Mask - 1; 2138 } 2139 2140 return IC.replaceInstUsesWith(II, 2141 ConstantInt::get(II.getType(), Result)); 2142 } 2143 } 2144 break; 2145 2146 case Intrinsic::x86_sse_cvtss2si: 2147 case Intrinsic::x86_sse_cvtss2si64: 2148 case Intrinsic::x86_sse_cvttss2si: 2149 case Intrinsic::x86_sse_cvttss2si64: 2150 case Intrinsic::x86_sse2_cvtsd2si: 2151 case Intrinsic::x86_sse2_cvtsd2si64: 2152 case Intrinsic::x86_sse2_cvttsd2si: 2153 case Intrinsic::x86_sse2_cvttsd2si64: 2154 case Intrinsic::x86_avx512_vcvtss2si32: 2155 case Intrinsic::x86_avx512_vcvtss2si64: 2156 case Intrinsic::x86_avx512_vcvtss2usi32: 2157 case Intrinsic::x86_avx512_vcvtss2usi64: 2158 case Intrinsic::x86_avx512_vcvtsd2si32: 2159 case Intrinsic::x86_avx512_vcvtsd2si64: 2160 case Intrinsic::x86_avx512_vcvtsd2usi32: 2161 case Intrinsic::x86_avx512_vcvtsd2usi64: 2162 case Intrinsic::x86_avx512_cvttss2si: 2163 case Intrinsic::x86_avx512_cvttss2si64: 2164 case Intrinsic::x86_avx512_cvttss2usi: 2165 case Intrinsic::x86_avx512_cvttss2usi64: 2166 case Intrinsic::x86_avx512_cvttsd2si: 2167 case Intrinsic::x86_avx512_cvttsd2si64: 2168 case Intrinsic::x86_avx512_cvttsd2usi: 2169 case Intrinsic::x86_avx512_cvttsd2usi64: { 2170 // These intrinsics only demand the 0th element of their input vectors. If 2171 // we can simplify the input based on that, do so now. 2172 Value *Arg = II.getArgOperand(0); 2173 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 2174 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2175 return IC.replaceOperand(II, 0, V); 2176 } 2177 break; 2178 } 2179 2180 case Intrinsic::x86_mmx_pmovmskb: 2181 case Intrinsic::x86_sse_movmsk_ps: 2182 case Intrinsic::x86_sse2_movmsk_pd: 2183 case Intrinsic::x86_sse2_pmovmskb_128: 2184 case Intrinsic::x86_avx_movmsk_pd_256: 2185 case Intrinsic::x86_avx_movmsk_ps_256: 2186 case Intrinsic::x86_avx2_pmovmskb: 2187 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 2188 return IC.replaceInstUsesWith(II, V); 2189 } 2190 break; 2191 2192 case Intrinsic::x86_sse_comieq_ss: 2193 case Intrinsic::x86_sse_comige_ss: 2194 case Intrinsic::x86_sse_comigt_ss: 2195 case Intrinsic::x86_sse_comile_ss: 2196 case Intrinsic::x86_sse_comilt_ss: 2197 case Intrinsic::x86_sse_comineq_ss: 2198 case Intrinsic::x86_sse_ucomieq_ss: 2199 case Intrinsic::x86_sse_ucomige_ss: 2200 case Intrinsic::x86_sse_ucomigt_ss: 2201 case Intrinsic::x86_sse_ucomile_ss: 2202 case Intrinsic::x86_sse_ucomilt_ss: 2203 case Intrinsic::x86_sse_ucomineq_ss: 2204 case Intrinsic::x86_sse2_comieq_sd: 2205 case Intrinsic::x86_sse2_comige_sd: 2206 case Intrinsic::x86_sse2_comigt_sd: 2207 case Intrinsic::x86_sse2_comile_sd: 2208 case Intrinsic::x86_sse2_comilt_sd: 2209 case Intrinsic::x86_sse2_comineq_sd: 2210 case Intrinsic::x86_sse2_ucomieq_sd: 2211 case Intrinsic::x86_sse2_ucomige_sd: 2212 case Intrinsic::x86_sse2_ucomigt_sd: 2213 case Intrinsic::x86_sse2_ucomile_sd: 2214 case Intrinsic::x86_sse2_ucomilt_sd: 2215 case Intrinsic::x86_sse2_ucomineq_sd: 2216 case Intrinsic::x86_avx512_vcomi_ss: 2217 case Intrinsic::x86_avx512_vcomi_sd: 2218 case Intrinsic::x86_avx512_mask_cmp_ss: 2219 case Intrinsic::x86_avx512_mask_cmp_sd: { 2220 // These intrinsics only demand the 0th element of their input vectors. If 2221 // we can simplify the input based on that, do so now. 
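    // E.g. the upper three lanes of a comieq.ss operand never affect the
    // result, so code that only writes those lanes can be simplified away.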
2222 bool MadeChange = false; 2223 Value *Arg0 = II.getArgOperand(0); 2224 Value *Arg1 = II.getArgOperand(1); 2225 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2226 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2227 IC.replaceOperand(II, 0, V); 2228 MadeChange = true; 2229 } 2230 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2231 IC.replaceOperand(II, 1, V); 2232 MadeChange = true; 2233 } 2234 if (MadeChange) { 2235 return &II; 2236 } 2237 break; 2238 } 2239 2240 case Intrinsic::x86_avx512_add_ps_512: 2241 case Intrinsic::x86_avx512_div_ps_512: 2242 case Intrinsic::x86_avx512_mul_ps_512: 2243 case Intrinsic::x86_avx512_sub_ps_512: 2244 case Intrinsic::x86_avx512_add_pd_512: 2245 case Intrinsic::x86_avx512_div_pd_512: 2246 case Intrinsic::x86_avx512_mul_pd_512: 2247 case Intrinsic::x86_avx512_sub_pd_512: 2248 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2249 // IR operations. 2250 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2251 if (R->getValue() == 4) { 2252 Value *Arg0 = II.getArgOperand(0); 2253 Value *Arg1 = II.getArgOperand(1); 2254 2255 Value *V; 2256 switch (IID) { 2257 default: 2258 llvm_unreachable("Case stmts out of sync!"); 2259 case Intrinsic::x86_avx512_add_ps_512: 2260 case Intrinsic::x86_avx512_add_pd_512: 2261 V = IC.Builder.CreateFAdd(Arg0, Arg1); 2262 break; 2263 case Intrinsic::x86_avx512_sub_ps_512: 2264 case Intrinsic::x86_avx512_sub_pd_512: 2265 V = IC.Builder.CreateFSub(Arg0, Arg1); 2266 break; 2267 case Intrinsic::x86_avx512_mul_ps_512: 2268 case Intrinsic::x86_avx512_mul_pd_512: 2269 V = IC.Builder.CreateFMul(Arg0, Arg1); 2270 break; 2271 case Intrinsic::x86_avx512_div_ps_512: 2272 case Intrinsic::x86_avx512_div_pd_512: 2273 V = IC.Builder.CreateFDiv(Arg0, Arg1); 2274 break; 2275 } 2276 2277 return IC.replaceInstUsesWith(II, V); 2278 } 2279 } 2280 break; 2281 2282 case Intrinsic::x86_avx512_mask_add_ss_round: 2283 case Intrinsic::x86_avx512_mask_div_ss_round: 2284 case Intrinsic::x86_avx512_mask_mul_ss_round: 2285 case Intrinsic::x86_avx512_mask_sub_ss_round: 2286 case Intrinsic::x86_avx512_mask_add_sd_round: 2287 case Intrinsic::x86_avx512_mask_div_sd_round: 2288 case Intrinsic::x86_avx512_mask_mul_sd_round: 2289 case Intrinsic::x86_avx512_mask_sub_sd_round: 2290 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2291 // IR operations. 2292 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 2293 if (R->getValue() == 4) { 2294 // Extract the element as scalars. 
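        // E.g. mask.add.ss.round(A, B, Passthru, K, 4) becomes, roughly,
        // an 'fadd' of the low elements of A and B, selected against the low
        // element of Passthru by bit 0 of K, and reinserted into lane 0 of A.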
2295 Value *Arg0 = II.getArgOperand(0); 2296 Value *Arg1 = II.getArgOperand(1); 2297 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 2298 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 2299 2300 Value *V; 2301 switch (IID) { 2302 default: 2303 llvm_unreachable("Case stmts out of sync!"); 2304 case Intrinsic::x86_avx512_mask_add_ss_round: 2305 case Intrinsic::x86_avx512_mask_add_sd_round: 2306 V = IC.Builder.CreateFAdd(LHS, RHS); 2307 break; 2308 case Intrinsic::x86_avx512_mask_sub_ss_round: 2309 case Intrinsic::x86_avx512_mask_sub_sd_round: 2310 V = IC.Builder.CreateFSub(LHS, RHS); 2311 break; 2312 case Intrinsic::x86_avx512_mask_mul_ss_round: 2313 case Intrinsic::x86_avx512_mask_mul_sd_round: 2314 V = IC.Builder.CreateFMul(LHS, RHS); 2315 break; 2316 case Intrinsic::x86_avx512_mask_div_ss_round: 2317 case Intrinsic::x86_avx512_mask_div_sd_round: 2318 V = IC.Builder.CreateFDiv(LHS, RHS); 2319 break; 2320 } 2321 2322 // Handle the masking aspect of the intrinsic. 2323 Value *Mask = II.getArgOperand(3); 2324 auto *C = dyn_cast<ConstantInt>(Mask); 2325 // We don't need a select if we know the mask bit is a 1. 2326 if (!C || !C->getValue()[0]) { 2327 // Cast the mask to an i1 vector and then extract the lowest element. 2328 auto *MaskTy = FixedVectorType::get( 2329 IC.Builder.getInt1Ty(), 2330 cast<IntegerType>(Mask->getType())->getBitWidth()); 2331 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 2332 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 2333 // Extract the lowest element from the passthru operand. 2334 Value *Passthru = 2335 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 2336 V = IC.Builder.CreateSelect(Mask, V, Passthru); 2337 } 2338 2339 // Insert the result back into the original argument 0. 2340 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2341 2342 return IC.replaceInstUsesWith(II, V); 2343 } 2344 } 2345 break; 2346 2347 // Constant fold ashr( <A x Bi>, Ci ). 2348 // Constant fold lshr( <A x Bi>, Ci ). 2349 // Constant fold shl( <A x Bi>, Ci ). 
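  // E.g. a psrli.d by an immediate of 3 becomes 'lshr <4 x i32> %x, <3,3,3,3>';
  // logical shifts by the element width or more fold to zero, while the
  // arithmetic forms clamp the amount and produce a splat of the sign bit.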
2350 case Intrinsic::x86_sse2_psrai_d: 2351 case Intrinsic::x86_sse2_psrai_w: 2352 case Intrinsic::x86_avx2_psrai_d: 2353 case Intrinsic::x86_avx2_psrai_w: 2354 case Intrinsic::x86_avx512_psrai_q_128: 2355 case Intrinsic::x86_avx512_psrai_q_256: 2356 case Intrinsic::x86_avx512_psrai_d_512: 2357 case Intrinsic::x86_avx512_psrai_q_512: 2358 case Intrinsic::x86_avx512_psrai_w_512: 2359 case Intrinsic::x86_sse2_psrli_d: 2360 case Intrinsic::x86_sse2_psrli_q: 2361 case Intrinsic::x86_sse2_psrli_w: 2362 case Intrinsic::x86_avx2_psrli_d: 2363 case Intrinsic::x86_avx2_psrli_q: 2364 case Intrinsic::x86_avx2_psrli_w: 2365 case Intrinsic::x86_avx512_psrli_d_512: 2366 case Intrinsic::x86_avx512_psrli_q_512: 2367 case Intrinsic::x86_avx512_psrli_w_512: 2368 case Intrinsic::x86_sse2_pslli_d: 2369 case Intrinsic::x86_sse2_pslli_q: 2370 case Intrinsic::x86_sse2_pslli_w: 2371 case Intrinsic::x86_avx2_pslli_d: 2372 case Intrinsic::x86_avx2_pslli_q: 2373 case Intrinsic::x86_avx2_pslli_w: 2374 case Intrinsic::x86_avx512_pslli_d_512: 2375 case Intrinsic::x86_avx512_pslli_q_512: 2376 case Intrinsic::x86_avx512_pslli_w_512: 2377 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2378 return IC.replaceInstUsesWith(II, V); 2379 } 2380 break; 2381 2382 case Intrinsic::x86_sse2_psra_d: 2383 case Intrinsic::x86_sse2_psra_w: 2384 case Intrinsic::x86_avx2_psra_d: 2385 case Intrinsic::x86_avx2_psra_w: 2386 case Intrinsic::x86_avx512_psra_q_128: 2387 case Intrinsic::x86_avx512_psra_q_256: 2388 case Intrinsic::x86_avx512_psra_d_512: 2389 case Intrinsic::x86_avx512_psra_q_512: 2390 case Intrinsic::x86_avx512_psra_w_512: 2391 case Intrinsic::x86_sse2_psrl_d: 2392 case Intrinsic::x86_sse2_psrl_q: 2393 case Intrinsic::x86_sse2_psrl_w: 2394 case Intrinsic::x86_avx2_psrl_d: 2395 case Intrinsic::x86_avx2_psrl_q: 2396 case Intrinsic::x86_avx2_psrl_w: 2397 case Intrinsic::x86_avx512_psrl_d_512: 2398 case Intrinsic::x86_avx512_psrl_q_512: 2399 case Intrinsic::x86_avx512_psrl_w_512: 2400 case Intrinsic::x86_sse2_psll_d: 2401 case Intrinsic::x86_sse2_psll_q: 2402 case Intrinsic::x86_sse2_psll_w: 2403 case Intrinsic::x86_avx2_psll_d: 2404 case Intrinsic::x86_avx2_psll_q: 2405 case Intrinsic::x86_avx2_psll_w: 2406 case Intrinsic::x86_avx512_psll_d_512: 2407 case Intrinsic::x86_avx512_psll_q_512: 2408 case Intrinsic::x86_avx512_psll_w_512: { 2409 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2410 return IC.replaceInstUsesWith(II, V); 2411 } 2412 2413 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2414 // operand to compute the shift amount. 
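    // E.g. for psrl.w the <8 x i16> shift-amount operand only contributes its
    // low four elements, so the demanded-elements call below treats the upper
    // half as dead.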
2415 Value *Arg1 = II.getArgOperand(1); 2416 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2417 "Unexpected packed shift size"); 2418 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 2419 2420 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2421 return IC.replaceOperand(II, 1, V); 2422 } 2423 break; 2424 } 2425 2426 case Intrinsic::x86_avx2_psllv_d: 2427 case Intrinsic::x86_avx2_psllv_d_256: 2428 case Intrinsic::x86_avx2_psllv_q: 2429 case Intrinsic::x86_avx2_psllv_q_256: 2430 case Intrinsic::x86_avx512_psllv_d_512: 2431 case Intrinsic::x86_avx512_psllv_q_512: 2432 case Intrinsic::x86_avx512_psllv_w_128: 2433 case Intrinsic::x86_avx512_psllv_w_256: 2434 case Intrinsic::x86_avx512_psllv_w_512: 2435 case Intrinsic::x86_avx2_psrav_d: 2436 case Intrinsic::x86_avx2_psrav_d_256: 2437 case Intrinsic::x86_avx512_psrav_q_128: 2438 case Intrinsic::x86_avx512_psrav_q_256: 2439 case Intrinsic::x86_avx512_psrav_d_512: 2440 case Intrinsic::x86_avx512_psrav_q_512: 2441 case Intrinsic::x86_avx512_psrav_w_128: 2442 case Intrinsic::x86_avx512_psrav_w_256: 2443 case Intrinsic::x86_avx512_psrav_w_512: 2444 case Intrinsic::x86_avx2_psrlv_d: 2445 case Intrinsic::x86_avx2_psrlv_d_256: 2446 case Intrinsic::x86_avx2_psrlv_q: 2447 case Intrinsic::x86_avx2_psrlv_q_256: 2448 case Intrinsic::x86_avx512_psrlv_d_512: 2449 case Intrinsic::x86_avx512_psrlv_q_512: 2450 case Intrinsic::x86_avx512_psrlv_w_128: 2451 case Intrinsic::x86_avx512_psrlv_w_256: 2452 case Intrinsic::x86_avx512_psrlv_w_512: 2453 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 2454 return IC.replaceInstUsesWith(II, V); 2455 } 2456 break; 2457 2458 case Intrinsic::x86_sse2_packssdw_128: 2459 case Intrinsic::x86_sse2_packsswb_128: 2460 case Intrinsic::x86_avx2_packssdw: 2461 case Intrinsic::x86_avx2_packsswb: 2462 case Intrinsic::x86_avx512_packssdw_512: 2463 case Intrinsic::x86_avx512_packsswb_512: 2464 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 2465 return IC.replaceInstUsesWith(II, V); 2466 } 2467 break; 2468 2469 case Intrinsic::x86_sse2_packuswb_128: 2470 case Intrinsic::x86_sse41_packusdw: 2471 case Intrinsic::x86_avx2_packusdw: 2472 case Intrinsic::x86_avx2_packuswb: 2473 case Intrinsic::x86_avx512_packusdw_512: 2474 case Intrinsic::x86_avx512_packuswb_512: 2475 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 2476 return IC.replaceInstUsesWith(II, V); 2477 } 2478 break; 2479 2480 case Intrinsic::x86_pclmulqdq: 2481 case Intrinsic::x86_pclmulqdq_256: 2482 case Intrinsic::x86_pclmulqdq_512: { 2483 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2484 unsigned Imm = C->getZExtValue(); 2485 2486 bool MadeChange = false; 2487 Value *Arg0 = II.getArgOperand(0); 2488 Value *Arg1 = II.getArgOperand(1); 2489 unsigned VWidth = 2490 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2491 2492 APInt UndefElts1(VWidth, 0); 2493 APInt DemandedElts1 = 2494 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); 2495 if (Value *V = 2496 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 2497 IC.replaceOperand(II, 0, V); 2498 MadeChange = true; 2499 } 2500 2501 APInt UndefElts2(VWidth, 0); 2502 APInt DemandedElts2 = 2503 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 2504 if (Value *V = 2505 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 2506 IC.replaceOperand(II, 1, V); 2507 MadeChange = true; 2508 } 2509 2510 // If either input elements are undef, the result is zero. 
2511 if (DemandedElts1.isSubsetOf(UndefElts1) || 2512 DemandedElts2.isSubsetOf(UndefElts2)) { 2513 return IC.replaceInstUsesWith(II, 2514 ConstantAggregateZero::get(II.getType())); 2515 } 2516 2517 if (MadeChange) { 2518 return &II; 2519 } 2520 } 2521 break; 2522 } 2523 2524 case Intrinsic::x86_sse41_insertps: 2525 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 2526 return IC.replaceInstUsesWith(II, V); 2527 } 2528 break; 2529 2530 case Intrinsic::x86_sse4a_extrq: { 2531 Value *Op0 = II.getArgOperand(0); 2532 Value *Op1 = II.getArgOperand(1); 2533 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2534 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2535 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2536 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2537 VWidth1 == 16 && "Unexpected operand sizes"); 2538 2539 // See if we're dealing with constant values. 2540 auto *C1 = dyn_cast<Constant>(Op1); 2541 auto *CILength = 2542 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2543 : nullptr; 2544 auto *CIIndex = 2545 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2546 : nullptr; 2547 2548 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2549 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2550 return IC.replaceInstUsesWith(II, V); 2551 } 2552 2553 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2554 // operands and the lowest 16-bits of the second. 2555 bool MadeChange = false; 2556 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2557 IC.replaceOperand(II, 0, V); 2558 MadeChange = true; 2559 } 2560 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2561 IC.replaceOperand(II, 1, V); 2562 MadeChange = true; 2563 } 2564 if (MadeChange) { 2565 return &II; 2566 } 2567 break; 2568 } 2569 2570 case Intrinsic::x86_sse4a_extrqi: { 2571 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2572 // bits of the lower 64-bits. The upper 64-bits are undefined. 2573 Value *Op0 = II.getArgOperand(0); 2574 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2575 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2576 "Unexpected operand size"); 2577 2578 // See if we're dealing with constant values. 2579 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 2580 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2581 2582 // Attempt to simplify to a constant or shuffle vector. 2583 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2584 return IC.replaceInstUsesWith(II, V); 2585 } 2586 2587 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2588 // operand. 2589 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2590 return IC.replaceOperand(II, 0, V); 2591 } 2592 break; 2593 } 2594 2595 case Intrinsic::x86_sse4a_insertq: { 2596 Value *Op0 = II.getArgOperand(0); 2597 Value *Op1 = II.getArgOperand(1); 2598 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2599 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2600 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2601 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 2602 "Unexpected operand size"); 2603 2604 // See if we're dealing with constant values. 2605 auto *C1 = dyn_cast<Constant>(Op1); 2606 auto *CI11 = 2607 C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2608 : nullptr; 2609 2610 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2611 if (CI11) { 2612 const APInt &V11 = CI11->getValue(); 2613 APInt Len = V11.zextOrTrunc(6); 2614 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2615 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2616 return IC.replaceInstUsesWith(II, V); 2617 } 2618 } 2619 2620 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2621 // operand. 2622 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2623 return IC.replaceOperand(II, 0, V); 2624 } 2625 break; 2626 } 2627 2628 case Intrinsic::x86_sse4a_insertqi: { 2629 // INSERTQI: Extract lowest Length bits from lower half of second source and 2630 // insert over first source starting at Index bit. The upper 64-bits are 2631 // undefined. 2632 Value *Op0 = II.getArgOperand(0); 2633 Value *Op1 = II.getArgOperand(1); 2634 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2635 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2636 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2637 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2638 VWidth1 == 2 && "Unexpected operand sizes"); 2639 2640 // See if we're dealing with constant values. 2641 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2642 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 2643 2644 // Attempt to simplify to a constant or shuffle vector. 2645 if (CILength && CIIndex) { 2646 APInt Len = CILength->getValue().zextOrTrunc(6); 2647 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2648 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2649 return IC.replaceInstUsesWith(II, V); 2650 } 2651 } 2652 2653 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2654 // operands. 2655 bool MadeChange = false; 2656 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2657 IC.replaceOperand(II, 0, V); 2658 MadeChange = true; 2659 } 2660 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2661 IC.replaceOperand(II, 1, V); 2662 MadeChange = true; 2663 } 2664 if (MadeChange) { 2665 return &II; 2666 } 2667 break; 2668 } 2669 2670 case Intrinsic::x86_sse41_pblendvb: 2671 case Intrinsic::x86_sse41_blendvps: 2672 case Intrinsic::x86_sse41_blendvpd: 2673 case Intrinsic::x86_avx_blendv_ps_256: 2674 case Intrinsic::x86_avx_blendv_pd_256: 2675 case Intrinsic::x86_avx2_pblendvb: { 2676 // fold (blend A, A, Mask) -> A 2677 Value *Op0 = II.getArgOperand(0); 2678 Value *Op1 = II.getArgOperand(1); 2679 Value *Mask = II.getArgOperand(2); 2680 if (Op0 == Op1) { 2681 return IC.replaceInstUsesWith(II, Op0); 2682 } 2683 2684 // Zero Mask - select 1st argument. 2685 if (isa<ConstantAggregateZero>(Mask)) { 2686 return IC.replaceInstUsesWith(II, Op0); 2687 } 2688 2689 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 2690 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 2691 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 2692 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 2693 } 2694 2695 // Convert to a vector select if we can bypass casts and find a boolean 2696 // vector condition value. 
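    // E.g. (illustrative IR):
    //   %m = sext <4 x i1> %c to <4 x i32>
    //   %r = blendvps(%x, %y, bitcast %m)
    // becomes:
    //   %r = select <4 x i1> %c, <4 x float> %y, <4 x float> %x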
2697 Value *BoolVec; 2698 Mask = InstCombiner::peekThroughBitcast(Mask); 2699 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && 2700 BoolVec->getType()->isVectorTy() && 2701 BoolVec->getType()->getScalarSizeInBits() == 1) { 2702 assert(Mask->getType()->getPrimitiveSizeInBits() == 2703 II.getType()->getPrimitiveSizeInBits() && 2704 "Not expecting mask and operands with different sizes"); 2705 2706 unsigned NumMaskElts = 2707 cast<FixedVectorType>(Mask->getType())->getNumElements(); 2708 unsigned NumOperandElts = 2709 cast<FixedVectorType>(II.getType())->getNumElements(); 2710 if (NumMaskElts == NumOperandElts) { 2711 return SelectInst::Create(BoolVec, Op1, Op0); 2712 } 2713 2714 // If the mask has less elements than the operands, each mask bit maps to 2715 // multiple elements of the operands. Bitcast back and forth. 2716 if (NumMaskElts < NumOperandElts) { 2717 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); 2718 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); 2719 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 2720 return new BitCastInst(Sel, II.getType()); 2721 } 2722 } 2723 2724 break; 2725 } 2726 2727 case Intrinsic::x86_ssse3_pshuf_b_128: 2728 case Intrinsic::x86_avx2_pshuf_b: 2729 case Intrinsic::x86_avx512_pshuf_b_512: 2730 if (Value *V = simplifyX86pshufb(II, IC.Builder)) { 2731 return IC.replaceInstUsesWith(II, V); 2732 } 2733 break; 2734 2735 case Intrinsic::x86_avx_vpermilvar_ps: 2736 case Intrinsic::x86_avx_vpermilvar_ps_256: 2737 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2738 case Intrinsic::x86_avx_vpermilvar_pd: 2739 case Intrinsic::x86_avx_vpermilvar_pd_256: 2740 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2741 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 2742 return IC.replaceInstUsesWith(II, V); 2743 } 2744 break; 2745 2746 case Intrinsic::x86_avx2_permd: 2747 case Intrinsic::x86_avx2_permps: 2748 case Intrinsic::x86_avx512_permvar_df_256: 2749 case Intrinsic::x86_avx512_permvar_df_512: 2750 case Intrinsic::x86_avx512_permvar_di_256: 2751 case Intrinsic::x86_avx512_permvar_di_512: 2752 case Intrinsic::x86_avx512_permvar_hi_128: 2753 case Intrinsic::x86_avx512_permvar_hi_256: 2754 case Intrinsic::x86_avx512_permvar_hi_512: 2755 case Intrinsic::x86_avx512_permvar_qi_128: 2756 case Intrinsic::x86_avx512_permvar_qi_256: 2757 case Intrinsic::x86_avx512_permvar_qi_512: 2758 case Intrinsic::x86_avx512_permvar_sf_512: 2759 case Intrinsic::x86_avx512_permvar_si_512: 2760 if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 2761 return IC.replaceInstUsesWith(II, V); 2762 } 2763 break; 2764 2765 case Intrinsic::x86_avx_maskload_ps: 2766 case Intrinsic::x86_avx_maskload_pd: 2767 case Intrinsic::x86_avx_maskload_ps_256: 2768 case Intrinsic::x86_avx_maskload_pd_256: 2769 case Intrinsic::x86_avx2_maskload_d: 2770 case Intrinsic::x86_avx2_maskload_q: 2771 case Intrinsic::x86_avx2_maskload_d_256: 2772 case Intrinsic::x86_avx2_maskload_q_256: 2773 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { 2774 return I; 2775 } 2776 break; 2777 2778 case Intrinsic::x86_sse2_maskmov_dqu: 2779 case Intrinsic::x86_avx_maskstore_ps: 2780 case Intrinsic::x86_avx_maskstore_pd: 2781 case Intrinsic::x86_avx_maskstore_ps_256: 2782 case Intrinsic::x86_avx_maskstore_pd_256: 2783 case Intrinsic::x86_avx2_maskstore_d: 2784 case Intrinsic::x86_avx2_maskstore_q: 2785 case Intrinsic::x86_avx2_maskstore_d_256: 2786 case Intrinsic::x86_avx2_maskstore_q_256: 2787 if (simplifyX86MaskedStore(II, IC)) { 2788 return nullptr; 
2789 } 2790 break; 2791 2792 case Intrinsic::x86_addcarry_32: 2793 case Intrinsic::x86_addcarry_64: 2794 if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 2795 return IC.replaceInstUsesWith(II, V); 2796 } 2797 break; 2798 2799 case Intrinsic::x86_avx512_pternlog_d_128: 2800 case Intrinsic::x86_avx512_pternlog_d_256: 2801 case Intrinsic::x86_avx512_pternlog_d_512: 2802 case Intrinsic::x86_avx512_pternlog_q_128: 2803 case Intrinsic::x86_avx512_pternlog_q_256: 2804 case Intrinsic::x86_avx512_pternlog_q_512: 2805 if (Value *V = simplifyTernarylogic(II, IC.Builder)) { 2806 return IC.replaceInstUsesWith(II, V); 2807 } 2808 break; 2809 default: 2810 break; 2811 } 2812 return std::nullopt; 2813 } 2814 2815 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 2816 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 2817 bool &KnownBitsComputed) const { 2818 switch (II.getIntrinsicID()) { 2819 default: 2820 break; 2821 case Intrinsic::x86_mmx_pmovmskb: 2822 case Intrinsic::x86_sse_movmsk_ps: 2823 case Intrinsic::x86_sse2_movmsk_pd: 2824 case Intrinsic::x86_sse2_pmovmskb_128: 2825 case Intrinsic::x86_avx_movmsk_ps_256: 2826 case Intrinsic::x86_avx_movmsk_pd_256: 2827 case Intrinsic::x86_avx2_pmovmskb: { 2828 // MOVMSK copies the vector elements' sign bits to the low bits 2829 // and zeros the high bits. 2830 unsigned ArgWidth; 2831 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { 2832 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. 2833 } else { 2834 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType()); 2835 ArgWidth = ArgType->getNumElements(); 2836 } 2837 2838 // If we don't need any of the low bits then return zero; 2839 // we know that DemandedMask is non-zero already. 2840 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); 2841 Type *VTy = II.getType(); 2842 if (DemandedElts.isZero()) { 2843 return ConstantInt::getNullValue(VTy); 2844 } 2845 2846 // We know that the upper bits are set to zero. 2847 Known.Zero.setBitsFrom(ArgWidth); 2848 KnownBitsComputed = true; 2849 break; 2850 } 2851 } 2852 return std::nullopt; 2853 } 2854 2855 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( 2856 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 2857 APInt &UndefElts2, APInt &UndefElts3, 2858 std::function<void(Instruction *, unsigned, APInt, APInt &)> 2859 simplifyAndSetOp) const { 2860 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); 2861 switch (II.getIntrinsicID()) { 2862 default: 2863 break; 2864 case Intrinsic::x86_xop_vfrcz_ss: 2865 case Intrinsic::x86_xop_vfrcz_sd: 2866 // The instructions for these intrinsics are specified to zero the upper 2867 // bits rather than pass them through like other scalar intrinsics, so we 2868 // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other 2869 // intrinsics. Instead we should return a zero vector. 2870 if (!DemandedElts[0]) { 2871 IC.addToWorklist(&II); 2872 return ConstantAggregateZero::get(II.getType()); 2873 } 2874 2875 // Only the lower element is used. 2876 DemandedElts = 1; 2877 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2878 2879 // Only the lower element is undefined. The high elements are zero. 2880 UndefElts = UndefElts[0]; 2881 break; 2882 2883 // Unary scalar-as-vector operations that work column-wise.
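  // E.g. rcp.ss computes a new value only for lane 0 and copies the remaining
  // lanes of its operand through, so when lane 0 isn't demanded the whole call
  // can be replaced by that operand.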
2884 case Intrinsic::x86_sse_rcp_ss: 2885 case Intrinsic::x86_sse_rsqrt_ss: 2886 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2887 2888 // If lowest element of a scalar op isn't used then use Arg0. 2889 if (!DemandedElts[0]) { 2890 IC.addToWorklist(&II); 2891 return II.getArgOperand(0); 2892 } 2893 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 2894 // checks). 2895 break; 2896 2897 // Binary scalar-as-vector operations that work column-wise. The high 2898 // elements come from operand 0. The low element is a function of both 2899 // operands. 2900 case Intrinsic::x86_sse_min_ss: 2901 case Intrinsic::x86_sse_max_ss: 2902 case Intrinsic::x86_sse_cmp_ss: 2903 case Intrinsic::x86_sse2_min_sd: 2904 case Intrinsic::x86_sse2_max_sd: 2905 case Intrinsic::x86_sse2_cmp_sd: { 2906 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2907 2908 // If lowest element of a scalar op isn't used then use Arg0. 2909 if (!DemandedElts[0]) { 2910 IC.addToWorklist(&II); 2911 return II.getArgOperand(0); 2912 } 2913 2914 // Only lower element is used for operand 1. 2915 DemandedElts = 1; 2916 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2917 2918 // Lower element is undefined if both lower elements are undefined. 2919 // Consider things like undef&0. The result is known zero, not undef. 2920 if (!UndefElts2[0]) 2921 UndefElts.clearBit(0); 2922 2923 break; 2924 } 2925 2926 // Binary scalar-as-vector operations that work column-wise. The high 2927 // elements come from operand 0 and the low element comes from operand 1. 2928 case Intrinsic::x86_sse41_round_ss: 2929 case Intrinsic::x86_sse41_round_sd: { 2930 // Don't use the low element of operand 0. 2931 APInt DemandedElts2 = DemandedElts; 2932 DemandedElts2.clearBit(0); 2933 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 2934 2935 // If lowest element of a scalar op isn't used then use Arg0. 2936 if (!DemandedElts[0]) { 2937 IC.addToWorklist(&II); 2938 return II.getArgOperand(0); 2939 } 2940 2941 // Only lower element is used for operand 1. 2942 DemandedElts = 1; 2943 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2944 2945 // Take the high undef elements from operand 0 and take the lower element 2946 // from operand 1. 2947 UndefElts.clearBit(0); 2948 UndefElts |= UndefElts2[0]; 2949 break; 2950 } 2951 2952 // Three input scalar-as-vector operations that work column-wise. The high 2953 // elements come from operand 0 and the low element is a function of all 2954 // three inputs. 2955 case Intrinsic::x86_avx512_mask_add_ss_round: 2956 case Intrinsic::x86_avx512_mask_div_ss_round: 2957 case Intrinsic::x86_avx512_mask_mul_ss_round: 2958 case Intrinsic::x86_avx512_mask_sub_ss_round: 2959 case Intrinsic::x86_avx512_mask_max_ss_round: 2960 case Intrinsic::x86_avx512_mask_min_ss_round: 2961 case Intrinsic::x86_avx512_mask_add_sd_round: 2962 case Intrinsic::x86_avx512_mask_div_sd_round: 2963 case Intrinsic::x86_avx512_mask_mul_sd_round: 2964 case Intrinsic::x86_avx512_mask_sub_sd_round: 2965 case Intrinsic::x86_avx512_mask_max_sd_round: 2966 case Intrinsic::x86_avx512_mask_min_sd_round: 2967 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2968 2969 // If lowest element of a scalar op isn't used then use Arg0. 2970 if (!DemandedElts[0]) { 2971 IC.addToWorklist(&II); 2972 return II.getArgOperand(0); 2973 } 2974 2975 // Only lower element is used for operand 1 and 2. 
2976 DemandedElts = 1; 2977 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2978 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 2979 2980 // Lower element is undefined if all three lower elements are undefined. 2981 // Consider things like undef&0. The result is known zero, not undef. 2982 if (!UndefElts2[0] || !UndefElts3[0]) 2983 UndefElts.clearBit(0); 2984 break; 2985 2986 // TODO: Add fmaddsub support? 2987 case Intrinsic::x86_sse3_addsub_pd: 2988 case Intrinsic::x86_sse3_addsub_ps: 2989 case Intrinsic::x86_avx_addsub_pd_256: 2990 case Intrinsic::x86_avx_addsub_ps_256: { 2991 // If none of the even or none of the odd lanes are required, turn this 2992 // into a generic FP math instruction. 2993 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 2994 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 2995 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 2996 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 2997 if (IsSubOnly || IsAddOnly) { 2998 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 2999 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 3000 IC.Builder.SetInsertPoint(&II); 3001 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 3002 return IC.Builder.CreateBinOp( 3003 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 3004 } 3005 3006 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3007 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3008 UndefElts &= UndefElts2; 3009 break; 3010 } 3011 3012 // General per-element vector operations. 3013 case Intrinsic::x86_avx2_psllv_d: 3014 case Intrinsic::x86_avx2_psllv_d_256: 3015 case Intrinsic::x86_avx2_psllv_q: 3016 case Intrinsic::x86_avx2_psllv_q_256: 3017 case Intrinsic::x86_avx2_psrlv_d: 3018 case Intrinsic::x86_avx2_psrlv_d_256: 3019 case Intrinsic::x86_avx2_psrlv_q: 3020 case Intrinsic::x86_avx2_psrlv_q_256: 3021 case Intrinsic::x86_avx2_psrav_d: 3022 case Intrinsic::x86_avx2_psrav_d_256: { 3023 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3024 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3025 UndefElts &= UndefElts2; 3026 break; 3027 } 3028 3029 case Intrinsic::x86_sse2_packssdw_128: 3030 case Intrinsic::x86_sse2_packsswb_128: 3031 case Intrinsic::x86_sse2_packuswb_128: 3032 case Intrinsic::x86_sse41_packusdw: 3033 case Intrinsic::x86_avx2_packssdw: 3034 case Intrinsic::x86_avx2_packsswb: 3035 case Intrinsic::x86_avx2_packusdw: 3036 case Intrinsic::x86_avx2_packuswb: 3037 case Intrinsic::x86_avx512_packssdw_512: 3038 case Intrinsic::x86_avx512_packsswb_512: 3039 case Intrinsic::x86_avx512_packusdw_512: 3040 case Intrinsic::x86_avx512_packuswb_512: { 3041 auto *Ty0 = II.getArgOperand(0)->getType(); 3042 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 3043 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 3044 3045 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 3046 unsigned VWidthPerLane = VWidth / NumLanes; 3047 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 3048 3049 // Per lane, pack the elements of the first input and then the second. 3050 // e.g. 
3051 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 3052 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 3053 for (int OpNum = 0; OpNum != 2; ++OpNum) { 3054 APInt OpDemandedElts(InnerVWidth, 0); 3055 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3056 unsigned LaneIdx = Lane * VWidthPerLane; 3057 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 3058 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 3059 if (DemandedElts[Idx]) 3060 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 3061 } 3062 } 3063 3064 // Demand elements from the operand. 3065 APInt OpUndefElts(InnerVWidth, 0); 3066 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 3067 3068 // Pack the operand's UNDEF elements, one lane at a time. 3069 OpUndefElts = OpUndefElts.zext(VWidth); 3070 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3071 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 3072 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 3073 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 3074 UndefElts |= LaneElts; 3075 } 3076 } 3077 break; 3078 } 3079 3080 // PSHUFB 3081 case Intrinsic::x86_ssse3_pshuf_b_128: 3082 case Intrinsic::x86_avx2_pshuf_b: 3083 case Intrinsic::x86_avx512_pshuf_b_512: 3084 // PERMILVAR 3085 case Intrinsic::x86_avx_vpermilvar_ps: 3086 case Intrinsic::x86_avx_vpermilvar_ps_256: 3087 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3088 case Intrinsic::x86_avx_vpermilvar_pd: 3089 case Intrinsic::x86_avx_vpermilvar_pd_256: 3090 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3091 // PERMV 3092 case Intrinsic::x86_avx2_permd: 3093 case Intrinsic::x86_avx2_permps: { 3094 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 3095 break; 3096 } 3097 3098 // SSE4A instructions leave the upper 64-bits of the 128-bit result 3099 // in an undefined state. 3100 case Intrinsic::x86_sse4a_extrq: 3101 case Intrinsic::x86_sse4a_extrqi: 3102 case Intrinsic::x86_sse4a_insertq: 3103 case Intrinsic::x86_sse4a_insertqi: 3104 UndefElts.setHighBits(VWidth / 2); 3105 break; 3106 } 3107 return std::nullopt; 3108 } 3109