//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
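  // Illustrative sketch (not part of the upstream comment): a mask built as
  //   %mask = sext <4 x i1> %cond to <4 x i32>
  // is peeled back to %cond, since sign-extension makes each element's sign
  // bit mirror the original boolean.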
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to
    // match the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
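  // Hypothetical IR sketch (illustrative only): a store such as
  //   call void @llvm.x86.avx.maskstore.ps(i8* %p, <4 x i32> zeroinitializer,
  //                                        <4 x float> %v)
  // writes no lanes, so the call can simply be erased.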
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
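  // For instance (illustrative, not from the source): psrai.d with a shift
  // amount known to be 3 becomes
  //   %r = ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>
  // while psrli.d with an amount known to be >= 32 folds to zeroinitializer.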
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all of the low 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
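  // Illustrative example: for a v8i16 shift whose amount operand is
  //   <8 x i16> <i16 2, i16 0, i16 0, i16 0, ...>
  // the low four elements concatenate to the 64-bit count 2.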
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
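// For example (illustrative): vpsravd with a constant amount of 40 on an i32
// element behaves like a shift by 31 (sign splat), whereas a generic IR 'ashr'
// by 40 would be poison; this is why out-of-range amounts must be clamped or
// rejected below.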
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
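  // e.g. (illustrative) if every element of %amt is known to be < 32 for a
  // v8i32 vpsrlvd, the call becomes simply
  //   %r = lshr <8 x i32> %v, %amt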
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
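    // Illustrative result shape (hypothetical IR): for addcarry.u32 with a
    // zero carry-in,
    //   %u = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
    // is repackaged below as { i8 zext(%u.1), i32 %u.0 } to match the x86
    // result type.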
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length; other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
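      // e.g. (illustrative) Length = 16, Index = 8 extracts source bytes
      // 1..2 into bytes 0..1 of the result, zero-fills bytes 2..7, and
      // leaves the high 8 bytes undef.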
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length; other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
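  // Worked example (illustrative): Index = 8, Length = 16 gives
  // Mask == 0xffff00, so the folded value is
  //   (V00 & ~0xffff00) | ((V10 & 0xffff) << 8)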
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
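  // e.g. (illustrative) control byte 0x05 selects byte 5 of the source's
  // 128-bit lane, while 0x83 has bit 7 set and therefore writes zero (mapped
  // below to an index into the all-zero second shuffle operand).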
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
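  // e.g. (illustrative) for vpermilvar.pd a mask element of 2 has bit 1 set,
  // so it selects element 1 of its own 128-bit lane; for vpermilvar.ps only
  // the low two bits of each mask element are used.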
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
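  // e.g. (illustrative) for vpermd (Size == 8) a constant index of 35 wraps
  // to 35 & 7 == 3, matching the hardware's modulo behaviour.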
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
}

Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
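      // Worked example (illustrative, not in the source): a control word of
      // 0x0404 encodes shift = 4, length = 4, so a constant LHS of 0xabcd
      // folds to (0xabcd >> 4) & 0xf == 0xc.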
963e8d8bef9SDimitry Andric     }
964e8d8bef9SDimitry Andric     break;
965e8d8bef9SDimitry Andric 
966e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_bzhi_32:
967e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_bzhi_64:
968e8d8bef9SDimitry Andric     // If the RHS is a constant we can try some simplifications.
969e8d8bef9SDimitry Andric     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
970e8d8bef9SDimitry Andric       uint64_t Index = C->getZExtValue() & 0xff;
971e8d8bef9SDimitry Andric       unsigned BitWidth = II.getType()->getIntegerBitWidth();
972e8d8bef9SDimitry Andric       if (Index >= BitWidth) {
973e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
974e8d8bef9SDimitry Andric       }
975e8d8bef9SDimitry Andric       if (Index == 0) {
976e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
977e8d8bef9SDimitry Andric       }
978e8d8bef9SDimitry Andric       // If the LHS is also a constant, we can completely constant fold this.
979e8d8bef9SDimitry Andric       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
980e8d8bef9SDimitry Andric         uint64_t Result = InC->getZExtValue();
981e8d8bef9SDimitry Andric         Result &= maskTrailingOnes<uint64_t>(Index);
982e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
983e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
984e8d8bef9SDimitry Andric       }
985e8d8bef9SDimitry Andric       // TODO should we convert this to an AND if the RHS is constant?
986e8d8bef9SDimitry Andric     }
987e8d8bef9SDimitry Andric     break;
988e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pext_32:
989e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pext_64:
990e8d8bef9SDimitry Andric     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
991e8d8bef9SDimitry Andric       if (MaskC->isNullValue()) {
992e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
993e8d8bef9SDimitry Andric       }
994e8d8bef9SDimitry Andric       if (MaskC->isAllOnesValue()) {
995e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
996e8d8bef9SDimitry Andric       }
997e8d8bef9SDimitry Andric 
998*81ad6265SDimitry Andric       unsigned MaskIdx, MaskLen;
999*81ad6265SDimitry Andric       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1000e8d8bef9SDimitry Andric         // Any single contiguous sequence of 1s anywhere in the mask simply
1001e8d8bef9SDimitry Andric         // describes a subset of the input bits shifted to the appropriate
1002e8d8bef9SDimitry Andric         // position. Replace with the straightforward IR.
1003e8d8bef9SDimitry Andric         Value *Input = II.getArgOperand(0);
1004e8d8bef9SDimitry Andric         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1005*81ad6265SDimitry Andric         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1006*81ad6265SDimitry Andric         Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
1007e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, Shifted);
1008e8d8bef9SDimitry Andric       }
1009e8d8bef9SDimitry Andric 
1010e8d8bef9SDimitry Andric       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1011e8d8bef9SDimitry Andric         uint64_t Src = SrcC->getZExtValue();
1012e8d8bef9SDimitry Andric         uint64_t Mask = MaskC->getZExtValue();
1013e8d8bef9SDimitry Andric         uint64_t Result = 0;
1014e8d8bef9SDimitry Andric         uint64_t BitToSet = 1;
1015e8d8bef9SDimitry Andric 
1016e8d8bef9SDimitry Andric         while (Mask) {
1017e8d8bef9SDimitry Andric           // Isolate lowest set bit.
1018e8d8bef9SDimitry Andric           uint64_t BitToTest = Mask & -Mask;
1019e8d8bef9SDimitry Andric           if (BitToTest & Src)
1020e8d8bef9SDimitry Andric             Result |= BitToSet;
1021e8d8bef9SDimitry Andric 
1022e8d8bef9SDimitry Andric           BitToSet <<= 1;
1023e8d8bef9SDimitry Andric           // Clear lowest set bit.
1024e8d8bef9SDimitry Andric           Mask &= Mask - 1;
1025e8d8bef9SDimitry Andric         }
1026e8d8bef9SDimitry Andric 
1027e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
1028e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
1029e8d8bef9SDimitry Andric       }
1030e8d8bef9SDimitry Andric     }
1031e8d8bef9SDimitry Andric     break;
1032e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pdep_32:
1033e8d8bef9SDimitry Andric   case Intrinsic::x86_bmi_pdep_64:
1034e8d8bef9SDimitry Andric     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1035e8d8bef9SDimitry Andric       if (MaskC->isNullValue()) {
1036e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1037e8d8bef9SDimitry Andric       }
1038e8d8bef9SDimitry Andric       if (MaskC->isAllOnesValue()) {
1039e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1040e8d8bef9SDimitry Andric       }
1041*81ad6265SDimitry Andric 
1042*81ad6265SDimitry Andric       unsigned MaskIdx, MaskLen;
1043*81ad6265SDimitry Andric       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1044e8d8bef9SDimitry Andric         // Any single contiguous sequence of 1s anywhere in the mask simply
1045e8d8bef9SDimitry Andric         // describes a subset of the input bits shifted to the appropriate
1046e8d8bef9SDimitry Andric         // position. Replace with the straightforward IR.
1047e8d8bef9SDimitry Andric         Value *Input = II.getArgOperand(0);
1048*81ad6265SDimitry Andric         Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1049*81ad6265SDimitry Andric         Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
1050e8d8bef9SDimitry Andric         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1051e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II, Masked);
1052e8d8bef9SDimitry Andric       }
1053e8d8bef9SDimitry Andric 
1054e8d8bef9SDimitry Andric       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1055e8d8bef9SDimitry Andric         uint64_t Src = SrcC->getZExtValue();
1056e8d8bef9SDimitry Andric         uint64_t Mask = MaskC->getZExtValue();
1057e8d8bef9SDimitry Andric         uint64_t Result = 0;
1058e8d8bef9SDimitry Andric         uint64_t BitToTest = 1;
1059e8d8bef9SDimitry Andric 
1060e8d8bef9SDimitry Andric         while (Mask) {
1061e8d8bef9SDimitry Andric           // Isolate lowest set bit.
1062e8d8bef9SDimitry Andric           uint64_t BitToSet = Mask & -Mask;
1063e8d8bef9SDimitry Andric           if (BitToTest & Src)
1064e8d8bef9SDimitry Andric             Result |= BitToSet;
1065e8d8bef9SDimitry Andric 
1066e8d8bef9SDimitry Andric           BitToTest <<= 1;
1067e8d8bef9SDimitry Andric           // Clear lowest set bit.
1068e8d8bef9SDimitry Andric           Mask &= Mask - 1;
1069e8d8bef9SDimitry Andric         }
1070e8d8bef9SDimitry Andric 
1071e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
1072e8d8bef9SDimitry Andric                                       ConstantInt::get(II.getType(), Result));
1073e8d8bef9SDimitry Andric       }
1074e8d8bef9SDimitry Andric     }
1075e8d8bef9SDimitry Andric     break;
1076e8d8bef9SDimitry Andric 
1077e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvtss2si:
1078e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvtss2si64:
1079e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvttss2si:
1080e8d8bef9SDimitry Andric   case Intrinsic::x86_sse_cvttss2si64:
1081e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvtsd2si:
1082e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvtsd2si64:
1083e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvttsd2si:
1084e8d8bef9SDimitry Andric   case Intrinsic::x86_sse2_cvttsd2si64:
1085e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2si32:
1086e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2si64:
1087e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2usi32:
1088e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtss2usi64:
1089e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2si32:
1090e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2si64:
1091e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2usi32:
1092e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vcvtsd2usi64:
1093e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2si:
1094e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2si64:
1095e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2usi:
1096e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttss2usi64:
1097e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2si:
1098e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2si64:
1099e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2usi:
1100e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_cvttsd2usi64: {
1101e8d8bef9SDimitry Andric     // These intrinsics only demand the 0th element of their input vectors. If
1102e8d8bef9SDimitry Andric     // we can simplify the input based on that, do so now.
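    // (Illustrative sketch, not from the original source: a scalar model of
    // why only lane 0 matters here, with assumed names. The truncating float
    // conversions compute their result from the lowest source lane only:
    //
    //   int32_t cvttss2siModel(const float Src[4]) {
    //     return static_cast<int32_t>(Src[0]); // lanes 1..3 are ignored
    //   }
    //
    // so any value inserted into the upper lanes of the argument is dead and
    // SimplifyDemandedVectorElts can strip it.)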
1103e8d8bef9SDimitry Andric Value *Arg = II.getArgOperand(0); 1104e8d8bef9SDimitry Andric unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 1105e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 1106e8d8bef9SDimitry Andric return IC.replaceOperand(II, 0, V); 1107e8d8bef9SDimitry Andric } 1108e8d8bef9SDimitry Andric break; 1109e8d8bef9SDimitry Andric } 1110e8d8bef9SDimitry Andric 1111e8d8bef9SDimitry Andric case Intrinsic::x86_mmx_pmovmskb: 1112e8d8bef9SDimitry Andric case Intrinsic::x86_sse_movmsk_ps: 1113e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_movmsk_pd: 1114e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_pmovmskb_128: 1115e8d8bef9SDimitry Andric case Intrinsic::x86_avx_movmsk_pd_256: 1116e8d8bef9SDimitry Andric case Intrinsic::x86_avx_movmsk_ps_256: 1117e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_pmovmskb: 1118e8d8bef9SDimitry Andric if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 1119e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1120e8d8bef9SDimitry Andric } 1121e8d8bef9SDimitry Andric break; 1122e8d8bef9SDimitry Andric 1123e8d8bef9SDimitry Andric case Intrinsic::x86_sse_comieq_ss: 1124e8d8bef9SDimitry Andric case Intrinsic::x86_sse_comige_ss: 1125e8d8bef9SDimitry Andric case Intrinsic::x86_sse_comigt_ss: 1126e8d8bef9SDimitry Andric case Intrinsic::x86_sse_comile_ss: 1127e8d8bef9SDimitry Andric case Intrinsic::x86_sse_comilt_ss: 1128e8d8bef9SDimitry Andric case Intrinsic::x86_sse_comineq_ss: 1129e8d8bef9SDimitry Andric case Intrinsic::x86_sse_ucomieq_ss: 1130e8d8bef9SDimitry Andric case Intrinsic::x86_sse_ucomige_ss: 1131e8d8bef9SDimitry Andric case Intrinsic::x86_sse_ucomigt_ss: 1132e8d8bef9SDimitry Andric case Intrinsic::x86_sse_ucomile_ss: 1133e8d8bef9SDimitry Andric case Intrinsic::x86_sse_ucomilt_ss: 1134e8d8bef9SDimitry Andric case Intrinsic::x86_sse_ucomineq_ss: 1135e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_comieq_sd: 1136e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_comige_sd: 1137e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_comigt_sd: 1138e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_comile_sd: 1139e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_comilt_sd: 1140e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_comineq_sd: 1141e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_ucomieq_sd: 1142e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_ucomige_sd: 1143e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_ucomigt_sd: 1144e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_ucomile_sd: 1145e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_ucomilt_sd: 1146e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_ucomineq_sd: 1147e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_vcomi_ss: 1148e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_vcomi_sd: 1149e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_cmp_ss: 1150e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_cmp_sd: { 1151e8d8bef9SDimitry Andric // These intrinsics only demand the 0th element of their input vectors. If 1152e8d8bef9SDimitry Andric // we can simplify the input based on that, do so now. 
1153e8d8bef9SDimitry Andric bool MadeChange = false; 1154e8d8bef9SDimitry Andric Value *Arg0 = II.getArgOperand(0); 1155e8d8bef9SDimitry Andric Value *Arg1 = II.getArgOperand(1); 1156e8d8bef9SDimitry Andric unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 1157e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 1158e8d8bef9SDimitry Andric IC.replaceOperand(II, 0, V); 1159e8d8bef9SDimitry Andric MadeChange = true; 1160e8d8bef9SDimitry Andric } 1161e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 1162e8d8bef9SDimitry Andric IC.replaceOperand(II, 1, V); 1163e8d8bef9SDimitry Andric MadeChange = true; 1164e8d8bef9SDimitry Andric } 1165e8d8bef9SDimitry Andric if (MadeChange) { 1166e8d8bef9SDimitry Andric return &II; 1167e8d8bef9SDimitry Andric } 1168e8d8bef9SDimitry Andric break; 1169e8d8bef9SDimitry Andric } 1170e8d8bef9SDimitry Andric 1171e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_add_ps_512: 1172e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_div_ps_512: 1173e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mul_ps_512: 1174e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_sub_ps_512: 1175e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_add_pd_512: 1176e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_div_pd_512: 1177e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mul_pd_512: 1178e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_sub_pd_512: 1179e8d8bef9SDimitry Andric // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 1180e8d8bef9SDimitry Andric // IR operations. 1181e8d8bef9SDimitry Andric if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 1182e8d8bef9SDimitry Andric if (R->getValue() == 4) { 1183e8d8bef9SDimitry Andric Value *Arg0 = II.getArgOperand(0); 1184e8d8bef9SDimitry Andric Value *Arg1 = II.getArgOperand(1); 1185e8d8bef9SDimitry Andric 1186e8d8bef9SDimitry Andric Value *V; 1187e8d8bef9SDimitry Andric switch (IID) { 1188e8d8bef9SDimitry Andric default: 1189e8d8bef9SDimitry Andric llvm_unreachable("Case stmts out of sync!"); 1190e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_add_ps_512: 1191e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_add_pd_512: 1192e8d8bef9SDimitry Andric V = IC.Builder.CreateFAdd(Arg0, Arg1); 1193e8d8bef9SDimitry Andric break; 1194e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_sub_ps_512: 1195e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_sub_pd_512: 1196e8d8bef9SDimitry Andric V = IC.Builder.CreateFSub(Arg0, Arg1); 1197e8d8bef9SDimitry Andric break; 1198e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mul_ps_512: 1199e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mul_pd_512: 1200e8d8bef9SDimitry Andric V = IC.Builder.CreateFMul(Arg0, Arg1); 1201e8d8bef9SDimitry Andric break; 1202e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_div_ps_512: 1203e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_div_pd_512: 1204e8d8bef9SDimitry Andric V = IC.Builder.CreateFDiv(Arg0, Arg1); 1205e8d8bef9SDimitry Andric break; 1206e8d8bef9SDimitry Andric } 1207e8d8bef9SDimitry Andric 1208e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1209e8d8bef9SDimitry Andric } 1210e8d8bef9SDimitry Andric } 1211e8d8bef9SDimitry Andric break; 1212e8d8bef9SDimitry Andric 1213e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_add_ss_round: 1214e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_div_ss_round: 1215e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_mul_ss_round: 1216e8d8bef9SDimitry 
Andric case Intrinsic::x86_avx512_mask_sub_ss_round: 1217e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_add_sd_round: 1218e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_div_sd_round: 1219e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_mul_sd_round: 1220e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_sub_sd_round: 1221e8d8bef9SDimitry Andric // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 1222e8d8bef9SDimitry Andric // IR operations. 1223e8d8bef9SDimitry Andric if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 1224e8d8bef9SDimitry Andric if (R->getValue() == 4) { 1225e8d8bef9SDimitry Andric // Extract the element as scalars. 1226e8d8bef9SDimitry Andric Value *Arg0 = II.getArgOperand(0); 1227e8d8bef9SDimitry Andric Value *Arg1 = II.getArgOperand(1); 1228e8d8bef9SDimitry Andric Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 1229e8d8bef9SDimitry Andric Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 1230e8d8bef9SDimitry Andric 1231e8d8bef9SDimitry Andric Value *V; 1232e8d8bef9SDimitry Andric switch (IID) { 1233e8d8bef9SDimitry Andric default: 1234e8d8bef9SDimitry Andric llvm_unreachable("Case stmts out of sync!"); 1235e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_add_ss_round: 1236e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_add_sd_round: 1237e8d8bef9SDimitry Andric V = IC.Builder.CreateFAdd(LHS, RHS); 1238e8d8bef9SDimitry Andric break; 1239e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_sub_ss_round: 1240e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_sub_sd_round: 1241e8d8bef9SDimitry Andric V = IC.Builder.CreateFSub(LHS, RHS); 1242e8d8bef9SDimitry Andric break; 1243e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_mul_ss_round: 1244e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_mul_sd_round: 1245e8d8bef9SDimitry Andric V = IC.Builder.CreateFMul(LHS, RHS); 1246e8d8bef9SDimitry Andric break; 1247e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_div_ss_round: 1248e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_div_sd_round: 1249e8d8bef9SDimitry Andric V = IC.Builder.CreateFDiv(LHS, RHS); 1250e8d8bef9SDimitry Andric break; 1251e8d8bef9SDimitry Andric } 1252e8d8bef9SDimitry Andric 1253e8d8bef9SDimitry Andric // Handle the masking aspect of the intrinsic. 1254e8d8bef9SDimitry Andric Value *Mask = II.getArgOperand(3); 1255e8d8bef9SDimitry Andric auto *C = dyn_cast<ConstantInt>(Mask); 1256e8d8bef9SDimitry Andric // We don't need a select if we know the mask bit is a 1. 1257e8d8bef9SDimitry Andric if (!C || !C->getValue()[0]) { 1258e8d8bef9SDimitry Andric // Cast the mask to an i1 vector and then extract the lowest element. 1259e8d8bef9SDimitry Andric auto *MaskTy = FixedVectorType::get( 1260e8d8bef9SDimitry Andric IC.Builder.getInt1Ty(), 1261e8d8bef9SDimitry Andric cast<IntegerType>(Mask->getType())->getBitWidth()); 1262e8d8bef9SDimitry Andric Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 1263e8d8bef9SDimitry Andric Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 1264e8d8bef9SDimitry Andric // Extract the lowest element from the passthru operand. 1265e8d8bef9SDimitry Andric Value *Passthru = 1266e8d8bef9SDimitry Andric IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 1267e8d8bef9SDimitry Andric V = IC.Builder.CreateSelect(Mask, V, Passthru); 1268e8d8bef9SDimitry Andric } 1269e8d8bef9SDimitry Andric 1270e8d8bef9SDimitry Andric // Insert the result back into the original argument 0. 
1271e8d8bef9SDimitry Andric V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 1272e8d8bef9SDimitry Andric 1273e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1274e8d8bef9SDimitry Andric } 1275e8d8bef9SDimitry Andric } 1276e8d8bef9SDimitry Andric break; 1277e8d8bef9SDimitry Andric 1278e8d8bef9SDimitry Andric // Constant fold ashr( <A x Bi>, Ci ). 1279e8d8bef9SDimitry Andric // Constant fold lshr( <A x Bi>, Ci ). 1280e8d8bef9SDimitry Andric // Constant fold shl( <A x Bi>, Ci ). 1281e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrai_d: 1282e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrai_w: 1283e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrai_d: 1284e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrai_w: 1285e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrai_q_128: 1286e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrai_q_256: 1287e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrai_d_512: 1288e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrai_q_512: 1289e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrai_w_512: 1290e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrli_d: 1291e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrli_q: 1292e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrli_w: 1293e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrli_d: 1294e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrli_q: 1295e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrli_w: 1296e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrli_d_512: 1297e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrli_q_512: 1298e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrli_w_512: 1299e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_pslli_d: 1300e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_pslli_q: 1301e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_pslli_w: 1302e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_pslli_d: 1303e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_pslli_q: 1304e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_pslli_w: 1305e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_pslli_d_512: 1306e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_pslli_q_512: 1307e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_pslli_w_512: 1308e8d8bef9SDimitry Andric if (Value *V = simplifyX86immShift(II, IC.Builder)) { 1309e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1310e8d8bef9SDimitry Andric } 1311e8d8bef9SDimitry Andric break; 1312e8d8bef9SDimitry Andric 1313e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psra_d: 1314e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psra_w: 1315e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psra_d: 1316e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psra_w: 1317e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psra_q_128: 1318e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psra_q_256: 1319e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psra_d_512: 1320e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psra_q_512: 1321e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psra_w_512: 1322e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrl_d: 1323e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrl_q: 1324e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psrl_w: 1325e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrl_d: 1326e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrl_q: 1327e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrl_w: 1328e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrl_d_512: 1329e8d8bef9SDimitry Andric case 
Intrinsic::x86_avx512_psrl_q_512: 1330e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrl_w_512: 1331e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psll_d: 1332e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psll_q: 1333e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_psll_w: 1334e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psll_d: 1335e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psll_q: 1336e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psll_w: 1337e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psll_d_512: 1338e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psll_q_512: 1339e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psll_w_512: { 1340e8d8bef9SDimitry Andric if (Value *V = simplifyX86immShift(II, IC.Builder)) { 1341e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1342e8d8bef9SDimitry Andric } 1343e8d8bef9SDimitry Andric 1344e8d8bef9SDimitry Andric // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 1345e8d8bef9SDimitry Andric // operand to compute the shift amount. 1346e8d8bef9SDimitry Andric Value *Arg1 = II.getArgOperand(1); 1347e8d8bef9SDimitry Andric assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 1348e8d8bef9SDimitry Andric "Unexpected packed shift size"); 1349e8d8bef9SDimitry Andric unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 1350e8d8bef9SDimitry Andric 1351e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 1352e8d8bef9SDimitry Andric return IC.replaceOperand(II, 1, V); 1353e8d8bef9SDimitry Andric } 1354e8d8bef9SDimitry Andric break; 1355e8d8bef9SDimitry Andric } 1356e8d8bef9SDimitry Andric 1357e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psllv_d: 1358e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psllv_d_256: 1359e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psllv_q: 1360e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psllv_q_256: 1361e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psllv_d_512: 1362e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psllv_q_512: 1363e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psllv_w_128: 1364e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psllv_w_256: 1365e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psllv_w_512: 1366e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrav_d: 1367e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrav_d_256: 1368e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_q_128: 1369e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_q_256: 1370e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_d_512: 1371e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_q_512: 1372e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_w_128: 1373e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_w_256: 1374e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrav_w_512: 1375e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrlv_d: 1376e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrlv_d_256: 1377e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrlv_q: 1378e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_psrlv_q_256: 1379e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrlv_d_512: 1380e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrlv_q_512: 1381e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrlv_w_128: 1382e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrlv_w_256: 1383e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_psrlv_w_512: 1384e8d8bef9SDimitry Andric if (Value *V = simplifyX86varShift(II, IC.Builder)) { 
1385e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1386e8d8bef9SDimitry Andric } 1387e8d8bef9SDimitry Andric break; 1388e8d8bef9SDimitry Andric 1389e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_packssdw_128: 1390e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_packsswb_128: 1391e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_packssdw: 1392e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_packsswb: 1393e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_packssdw_512: 1394e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_packsswb_512: 1395e8d8bef9SDimitry Andric if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 1396e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1397e8d8bef9SDimitry Andric } 1398e8d8bef9SDimitry Andric break; 1399e8d8bef9SDimitry Andric 1400e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_packuswb_128: 1401e8d8bef9SDimitry Andric case Intrinsic::x86_sse41_packusdw: 1402e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_packusdw: 1403e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_packuswb: 1404e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_packusdw_512: 1405e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_packuswb_512: 1406e8d8bef9SDimitry Andric if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 1407e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1408e8d8bef9SDimitry Andric } 1409e8d8bef9SDimitry Andric break; 1410e8d8bef9SDimitry Andric 1411e8d8bef9SDimitry Andric case Intrinsic::x86_pclmulqdq: 1412e8d8bef9SDimitry Andric case Intrinsic::x86_pclmulqdq_256: 1413e8d8bef9SDimitry Andric case Intrinsic::x86_pclmulqdq_512: { 1414e8d8bef9SDimitry Andric if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 1415e8d8bef9SDimitry Andric unsigned Imm = C->getZExtValue(); 1416e8d8bef9SDimitry Andric 1417e8d8bef9SDimitry Andric bool MadeChange = false; 1418e8d8bef9SDimitry Andric Value *Arg0 = II.getArgOperand(0); 1419e8d8bef9SDimitry Andric Value *Arg1 = II.getArgOperand(1); 1420e8d8bef9SDimitry Andric unsigned VWidth = 1421e8d8bef9SDimitry Andric cast<FixedVectorType>(Arg0->getType())->getNumElements(); 1422e8d8bef9SDimitry Andric 1423e8d8bef9SDimitry Andric APInt UndefElts1(VWidth, 0); 1424e8d8bef9SDimitry Andric APInt DemandedElts1 = 1425e8d8bef9SDimitry Andric APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); 1426e8d8bef9SDimitry Andric if (Value *V = 1427e8d8bef9SDimitry Andric IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 1428e8d8bef9SDimitry Andric IC.replaceOperand(II, 0, V); 1429e8d8bef9SDimitry Andric MadeChange = true; 1430e8d8bef9SDimitry Andric } 1431e8d8bef9SDimitry Andric 1432e8d8bef9SDimitry Andric APInt UndefElts2(VWidth, 0); 1433e8d8bef9SDimitry Andric APInt DemandedElts2 = 1434e8d8bef9SDimitry Andric APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 1435e8d8bef9SDimitry Andric if (Value *V = 1436e8d8bef9SDimitry Andric IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 1437e8d8bef9SDimitry Andric IC.replaceOperand(II, 1, V); 1438e8d8bef9SDimitry Andric MadeChange = true; 1439e8d8bef9SDimitry Andric } 1440e8d8bef9SDimitry Andric 1441e8d8bef9SDimitry Andric // If either input elements are undef, the result is zero. 
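    // (Illustrative sketch, not from the original source: PCLMULQDQ
    // multiplies one 64-bit half from each source, selected by immediate
    // bits 0 and 4, so per v2i64 pair the demanded element can be modelled
    // with hypothetical helpers:
    //
    //   unsigned demandedHalfOfArg0(unsigned Imm) { return (Imm & 0x01) ? 1 : 0; }
    //   unsigned demandedHalfOfArg1(unsigned Imm) { return (Imm & 0x10) ? 1 : 0; }
    //
    // If the one demanded half of either source turns out to be undef, the
    // whole carry-less product is replaced by zero below.)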
1442e8d8bef9SDimitry Andric       if (DemandedElts1.isSubsetOf(UndefElts1) ||
1443e8d8bef9SDimitry Andric           DemandedElts2.isSubsetOf(UndefElts2)) {
1444e8d8bef9SDimitry Andric         return IC.replaceInstUsesWith(II,
1445e8d8bef9SDimitry Andric                                       ConstantAggregateZero::get(II.getType()));
1446e8d8bef9SDimitry Andric       }
1447e8d8bef9SDimitry Andric 
1448e8d8bef9SDimitry Andric       if (MadeChange) {
1449e8d8bef9SDimitry Andric         return &II;
1450e8d8bef9SDimitry Andric       }
1451e8d8bef9SDimitry Andric     }
1452e8d8bef9SDimitry Andric     break;
1453e8d8bef9SDimitry Andric   }
1454e8d8bef9SDimitry Andric 
1455e8d8bef9SDimitry Andric   case Intrinsic::x86_sse41_insertps:
1456e8d8bef9SDimitry Andric     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1457e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1458e8d8bef9SDimitry Andric     }
1459e8d8bef9SDimitry Andric     break;
1460e8d8bef9SDimitry Andric 
1461e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_extrq: {
1462e8d8bef9SDimitry Andric     Value *Op0 = II.getArgOperand(0);
1463e8d8bef9SDimitry Andric     Value *Op1 = II.getArgOperand(1);
1464e8d8bef9SDimitry Andric     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1465e8d8bef9SDimitry Andric     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1466e8d8bef9SDimitry Andric     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1467e8d8bef9SDimitry Andric            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1468e8d8bef9SDimitry Andric            VWidth1 == 16 && "Unexpected operand sizes");
1469e8d8bef9SDimitry Andric 
1470e8d8bef9SDimitry Andric     // See if we're dealing with constant values.
1471fe6060f1SDimitry Andric     auto *C1 = dyn_cast<Constant>(Op1);
1472fe6060f1SDimitry Andric     auto *CILength =
1473e8d8bef9SDimitry Andric         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1474e8d8bef9SDimitry Andric            : nullptr;
1475fe6060f1SDimitry Andric     auto *CIIndex =
1476e8d8bef9SDimitry Andric         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1477e8d8bef9SDimitry Andric            : nullptr;
1478e8d8bef9SDimitry Andric 
1479e8d8bef9SDimitry Andric     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1480e8d8bef9SDimitry Andric     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1481e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1482e8d8bef9SDimitry Andric     }
1483e8d8bef9SDimitry Andric 
1484e8d8bef9SDimitry Andric     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
1485e8d8bef9SDimitry Andric     // operand and the lowest 16-bits of the second.
1486e8d8bef9SDimitry Andric     bool MadeChange = false;
1487e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1488e8d8bef9SDimitry Andric       IC.replaceOperand(II, 0, V);
1489e8d8bef9SDimitry Andric       MadeChange = true;
1490e8d8bef9SDimitry Andric     }
1491e8d8bef9SDimitry Andric     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1492e8d8bef9SDimitry Andric       IC.replaceOperand(II, 1, V);
1493e8d8bef9SDimitry Andric       MadeChange = true;
1494e8d8bef9SDimitry Andric     }
1495e8d8bef9SDimitry Andric     if (MadeChange) {
1496e8d8bef9SDimitry Andric       return &II;
1497e8d8bef9SDimitry Andric     }
1498e8d8bef9SDimitry Andric     break;
1499e8d8bef9SDimitry Andric   }
1500e8d8bef9SDimitry Andric 
1501e8d8bef9SDimitry Andric   case Intrinsic::x86_sse4a_extrqi: {
1502e8d8bef9SDimitry Andric     // EXTRQI: Extract Length bits starting from Index. Zero-pad the remaining
1503e8d8bef9SDimitry Andric     // bits of the lower 64-bits.
The upper 64-bits are undefined. 1504e8d8bef9SDimitry Andric Value *Op0 = II.getArgOperand(0); 1505e8d8bef9SDimitry Andric unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1506e8d8bef9SDimitry Andric assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 1507e8d8bef9SDimitry Andric "Unexpected operand size"); 1508e8d8bef9SDimitry Andric 1509e8d8bef9SDimitry Andric // See if we're dealing with constant values. 1510fe6060f1SDimitry Andric auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 1511fe6060f1SDimitry Andric auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1512e8d8bef9SDimitry Andric 1513e8d8bef9SDimitry Andric // Attempt to simplify to a constant or shuffle vector. 1514e8d8bef9SDimitry Andric if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 1515e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1516e8d8bef9SDimitry Andric } 1517e8d8bef9SDimitry Andric 1518e8d8bef9SDimitry Andric // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 1519e8d8bef9SDimitry Andric // operand. 1520e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 1521e8d8bef9SDimitry Andric return IC.replaceOperand(II, 0, V); 1522e8d8bef9SDimitry Andric } 1523e8d8bef9SDimitry Andric break; 1524e8d8bef9SDimitry Andric } 1525e8d8bef9SDimitry Andric 1526e8d8bef9SDimitry Andric case Intrinsic::x86_sse4a_insertq: { 1527e8d8bef9SDimitry Andric Value *Op0 = II.getArgOperand(0); 1528e8d8bef9SDimitry Andric Value *Op1 = II.getArgOperand(1); 1529e8d8bef9SDimitry Andric unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1530e8d8bef9SDimitry Andric assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 1531e8d8bef9SDimitry Andric Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 1532e8d8bef9SDimitry Andric cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 1533e8d8bef9SDimitry Andric "Unexpected operand size"); 1534e8d8bef9SDimitry Andric 1535e8d8bef9SDimitry Andric // See if we're dealing with constant values. 1536fe6060f1SDimitry Andric auto *C1 = dyn_cast<Constant>(Op1); 1537fe6060f1SDimitry Andric auto *CI11 = 1538e8d8bef9SDimitry Andric C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 1539e8d8bef9SDimitry Andric : nullptr; 1540e8d8bef9SDimitry Andric 1541e8d8bef9SDimitry Andric // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 1542e8d8bef9SDimitry Andric if (CI11) { 1543e8d8bef9SDimitry Andric const APInt &V11 = CI11->getValue(); 1544e8d8bef9SDimitry Andric APInt Len = V11.zextOrTrunc(6); 1545e8d8bef9SDimitry Andric APInt Idx = V11.lshr(8).zextOrTrunc(6); 1546e8d8bef9SDimitry Andric if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 1547e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1548e8d8bef9SDimitry Andric } 1549e8d8bef9SDimitry Andric } 1550e8d8bef9SDimitry Andric 1551e8d8bef9SDimitry Andric // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 1552e8d8bef9SDimitry Andric // operand. 
1553e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 1554e8d8bef9SDimitry Andric return IC.replaceOperand(II, 0, V); 1555e8d8bef9SDimitry Andric } 1556e8d8bef9SDimitry Andric break; 1557e8d8bef9SDimitry Andric } 1558e8d8bef9SDimitry Andric 1559e8d8bef9SDimitry Andric case Intrinsic::x86_sse4a_insertqi: { 1560e8d8bef9SDimitry Andric // INSERTQI: Extract lowest Length bits from lower half of second source and 1561e8d8bef9SDimitry Andric // insert over first source starting at Index bit. The upper 64-bits are 1562e8d8bef9SDimitry Andric // undefined. 1563e8d8bef9SDimitry Andric Value *Op0 = II.getArgOperand(0); 1564e8d8bef9SDimitry Andric Value *Op1 = II.getArgOperand(1); 1565e8d8bef9SDimitry Andric unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 1566e8d8bef9SDimitry Andric unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 1567e8d8bef9SDimitry Andric assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 1568e8d8bef9SDimitry Andric Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 1569e8d8bef9SDimitry Andric VWidth1 == 2 && "Unexpected operand sizes"); 1570e8d8bef9SDimitry Andric 1571e8d8bef9SDimitry Andric // See if we're dealing with constant values. 1572fe6060f1SDimitry Andric auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1573fe6060f1SDimitry Andric auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 1574e8d8bef9SDimitry Andric 1575e8d8bef9SDimitry Andric // Attempt to simplify to a constant or shuffle vector. 1576e8d8bef9SDimitry Andric if (CILength && CIIndex) { 1577e8d8bef9SDimitry Andric APInt Len = CILength->getValue().zextOrTrunc(6); 1578e8d8bef9SDimitry Andric APInt Idx = CIIndex->getValue().zextOrTrunc(6); 1579e8d8bef9SDimitry Andric if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 1580e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1581e8d8bef9SDimitry Andric } 1582e8d8bef9SDimitry Andric } 1583e8d8bef9SDimitry Andric 1584e8d8bef9SDimitry Andric // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 1585e8d8bef9SDimitry Andric // operands. 
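    // (Illustrative sketch, not from the original source: a scalar model of
    // the INSERTQI bitfield insert simplified above, with assumed names; Len
    // and Idx are the 6-bit immediates, and a Len of 0 is assumed to mean 64
    // bits:
    //
    //   uint64_t insertqiModel(uint64_t Dst, uint64_t Src, unsigned Len,
    //                          unsigned Idx) {
    //     uint64_t FieldMask = Len ? ((1ULL << Len) - 1) : ~0ULL;
    //     return (Dst & ~(FieldMask << Idx)) | ((Src & FieldMask) << Idx);
    //   }
    //
    // e.g. Dst = 0xAAAAAAAA, Src = 0xF, Len = 4, Idx = 8 gives 0xAAAAAFAA.)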
1586e8d8bef9SDimitry Andric bool MadeChange = false; 1587e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 1588e8d8bef9SDimitry Andric IC.replaceOperand(II, 0, V); 1589e8d8bef9SDimitry Andric MadeChange = true; 1590e8d8bef9SDimitry Andric } 1591e8d8bef9SDimitry Andric if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 1592e8d8bef9SDimitry Andric IC.replaceOperand(II, 1, V); 1593e8d8bef9SDimitry Andric MadeChange = true; 1594e8d8bef9SDimitry Andric } 1595e8d8bef9SDimitry Andric if (MadeChange) { 1596e8d8bef9SDimitry Andric return &II; 1597e8d8bef9SDimitry Andric } 1598e8d8bef9SDimitry Andric break; 1599e8d8bef9SDimitry Andric } 1600e8d8bef9SDimitry Andric 1601e8d8bef9SDimitry Andric case Intrinsic::x86_sse41_pblendvb: 1602e8d8bef9SDimitry Andric case Intrinsic::x86_sse41_blendvps: 1603e8d8bef9SDimitry Andric case Intrinsic::x86_sse41_blendvpd: 1604e8d8bef9SDimitry Andric case Intrinsic::x86_avx_blendv_ps_256: 1605e8d8bef9SDimitry Andric case Intrinsic::x86_avx_blendv_pd_256: 1606e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_pblendvb: { 1607e8d8bef9SDimitry Andric // fold (blend A, A, Mask) -> A 1608e8d8bef9SDimitry Andric Value *Op0 = II.getArgOperand(0); 1609e8d8bef9SDimitry Andric Value *Op1 = II.getArgOperand(1); 1610e8d8bef9SDimitry Andric Value *Mask = II.getArgOperand(2); 1611e8d8bef9SDimitry Andric if (Op0 == Op1) { 1612e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, Op0); 1613e8d8bef9SDimitry Andric } 1614e8d8bef9SDimitry Andric 1615e8d8bef9SDimitry Andric // Zero Mask - select 1st argument. 1616e8d8bef9SDimitry Andric if (isa<ConstantAggregateZero>(Mask)) { 1617e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, Op0); 1618e8d8bef9SDimitry Andric } 1619e8d8bef9SDimitry Andric 1620e8d8bef9SDimitry Andric // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 1621e8d8bef9SDimitry Andric if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 1622e8d8bef9SDimitry Andric Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 1623e8d8bef9SDimitry Andric return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 1624e8d8bef9SDimitry Andric } 1625e8d8bef9SDimitry Andric 1626e8d8bef9SDimitry Andric // Convert to a vector select if we can bypass casts and find a boolean 1627e8d8bef9SDimitry Andric // vector condition value. 
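    // (Illustrative sketch, not from the original source: per-lane blendv
    // semantics as a scalar model; only the sign bit of each mask lane
    // matters:
    //
    //   int32_t blendvLaneModel(int32_t Mask, int32_t Op0, int32_t Op1) {
    //     return Mask < 0 ? Op1 : Op0; // sign bit set selects the second op
    //   }
    //
    // so once the mask is known to be a sign-extended <N x i1>, the whole
    // call is exactly a vector select on that i1 condition.)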
1628e8d8bef9SDimitry Andric     Value *BoolVec;
1629e8d8bef9SDimitry Andric     Mask = InstCombiner::peekThroughBitcast(Mask);
1630e8d8bef9SDimitry Andric     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
1631e8d8bef9SDimitry Andric         BoolVec->getType()->isVectorTy() &&
1632e8d8bef9SDimitry Andric         BoolVec->getType()->getScalarSizeInBits() == 1) {
1633e8d8bef9SDimitry Andric       assert(Mask->getType()->getPrimitiveSizeInBits() ==
1634e8d8bef9SDimitry Andric                  II.getType()->getPrimitiveSizeInBits() &&
1635e8d8bef9SDimitry Andric              "Not expecting mask and operands with different sizes");
1636e8d8bef9SDimitry Andric 
1637e8d8bef9SDimitry Andric       unsigned NumMaskElts =
1638e8d8bef9SDimitry Andric           cast<FixedVectorType>(Mask->getType())->getNumElements();
1639e8d8bef9SDimitry Andric       unsigned NumOperandElts =
1640e8d8bef9SDimitry Andric           cast<FixedVectorType>(II.getType())->getNumElements();
1641e8d8bef9SDimitry Andric       if (NumMaskElts == NumOperandElts) {
1642e8d8bef9SDimitry Andric         return SelectInst::Create(BoolVec, Op1, Op0);
1643e8d8bef9SDimitry Andric       }
1644e8d8bef9SDimitry Andric 
1645e8d8bef9SDimitry Andric       // If the mask has fewer elements than the operands, each mask bit maps
1646e8d8bef9SDimitry Andric       // to multiple elements of the operands. Bitcast back and forth.
1647e8d8bef9SDimitry Andric       if (NumMaskElts < NumOperandElts) {
1648e8d8bef9SDimitry Andric         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1649e8d8bef9SDimitry Andric         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1650e8d8bef9SDimitry Andric         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1651e8d8bef9SDimitry Andric         return new BitCastInst(Sel, II.getType());
1652e8d8bef9SDimitry Andric       }
1653e8d8bef9SDimitry Andric     }
1654e8d8bef9SDimitry Andric 
1655e8d8bef9SDimitry Andric     break;
1656e8d8bef9SDimitry Andric   }
1657e8d8bef9SDimitry Andric 
1658e8d8bef9SDimitry Andric   case Intrinsic::x86_ssse3_pshuf_b_128:
1659e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pshuf_b:
1660e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_pshuf_b_512:
1661e8d8bef9SDimitry Andric     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1662e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1663e8d8bef9SDimitry Andric     }
1664e8d8bef9SDimitry Andric     break;
1665e8d8bef9SDimitry Andric 
1666e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_ps:
1667e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_ps_256:
1668e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1669e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_pd:
1670e8d8bef9SDimitry Andric   case Intrinsic::x86_avx_vpermilvar_pd_256:
1671e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_vpermilvar_pd_512:
1672e8d8bef9SDimitry Andric     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1673e8d8bef9SDimitry Andric       return IC.replaceInstUsesWith(II, V);
1674e8d8bef9SDimitry Andric     }
1675e8d8bef9SDimitry Andric     break;
1676e8d8bef9SDimitry Andric 
1677e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_permd:
1678e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_permps:
1679e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_df_256:
1680e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_df_512:
1681e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_di_256:
1682e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_di_512:
1683e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_hi_128:
1684e8d8bef9SDimitry Andric   case Intrinsic::x86_avx512_permvar_hi_256:
1685e8d8bef9SDimitry 
Andric case Intrinsic::x86_avx512_permvar_hi_512: 1686e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_permvar_qi_128: 1687e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_permvar_qi_256: 1688e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_permvar_qi_512: 1689e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_permvar_sf_512: 1690e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_permvar_si_512: 1691e8d8bef9SDimitry Andric if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 1692e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1693e8d8bef9SDimitry Andric } 1694e8d8bef9SDimitry Andric break; 1695e8d8bef9SDimitry Andric 1696e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskload_ps: 1697e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskload_pd: 1698e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskload_ps_256: 1699e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskload_pd_256: 1700e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskload_d: 1701e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskload_q: 1702e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskload_d_256: 1703e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskload_q_256: 1704e8d8bef9SDimitry Andric if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { 1705e8d8bef9SDimitry Andric return I; 1706e8d8bef9SDimitry Andric } 1707e8d8bef9SDimitry Andric break; 1708e8d8bef9SDimitry Andric 1709e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_maskmov_dqu: 1710e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskstore_ps: 1711e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskstore_pd: 1712e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskstore_ps_256: 1713e8d8bef9SDimitry Andric case Intrinsic::x86_avx_maskstore_pd_256: 1714e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskstore_d: 1715e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskstore_q: 1716e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskstore_d_256: 1717e8d8bef9SDimitry Andric case Intrinsic::x86_avx2_maskstore_q_256: 1718e8d8bef9SDimitry Andric if (simplifyX86MaskedStore(II, IC)) { 1719e8d8bef9SDimitry Andric return nullptr; 1720e8d8bef9SDimitry Andric } 1721e8d8bef9SDimitry Andric break; 1722e8d8bef9SDimitry Andric 1723e8d8bef9SDimitry Andric case Intrinsic::x86_addcarry_32: 1724e8d8bef9SDimitry Andric case Intrinsic::x86_addcarry_64: 1725e8d8bef9SDimitry Andric if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 1726e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 1727e8d8bef9SDimitry Andric } 1728e8d8bef9SDimitry Andric break; 1729e8d8bef9SDimitry Andric 1730e8d8bef9SDimitry Andric default: 1731e8d8bef9SDimitry Andric break; 1732e8d8bef9SDimitry Andric } 1733e8d8bef9SDimitry Andric return None; 1734e8d8bef9SDimitry Andric } 1735e8d8bef9SDimitry Andric 1736e8d8bef9SDimitry Andric Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 1737e8d8bef9SDimitry Andric InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 1738e8d8bef9SDimitry Andric bool &KnownBitsComputed) const { 1739e8d8bef9SDimitry Andric switch (II.getIntrinsicID()) { 1740e8d8bef9SDimitry Andric default: 1741e8d8bef9SDimitry Andric break; 1742e8d8bef9SDimitry Andric case Intrinsic::x86_mmx_pmovmskb: 1743e8d8bef9SDimitry Andric case Intrinsic::x86_sse_movmsk_ps: 1744e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_movmsk_pd: 1745e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_pmovmskb_128: 1746e8d8bef9SDimitry Andric case Intrinsic::x86_avx_movmsk_ps_256: 1747e8d8bef9SDimitry Andric case Intrinsic::x86_avx_movmsk_pd_256: 
1748e8d8bef9SDimitry Andric   case Intrinsic::x86_avx2_pmovmskb: {
1749e8d8bef9SDimitry Andric     // MOVMSK copies the vector elements' sign bits to the low bits
1750e8d8bef9SDimitry Andric     // and zeros the high bits.
1751e8d8bef9SDimitry Andric     unsigned ArgWidth;
1752e8d8bef9SDimitry Andric     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1753e8d8bef9SDimitry Andric       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1754e8d8bef9SDimitry Andric     } else {
1755fe6060f1SDimitry Andric       auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
1756e8d8bef9SDimitry Andric       ArgWidth = ArgType->getNumElements();
1757e8d8bef9SDimitry Andric     }
1758e8d8bef9SDimitry Andric 
1759e8d8bef9SDimitry Andric     // If we don't need any of the low bits then return zero; we already
1760e8d8bef9SDimitry Andric     // know that DemandedMask is non-zero.
1761e8d8bef9SDimitry Andric     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1762e8d8bef9SDimitry Andric     Type *VTy = II.getType();
1763349cc55cSDimitry Andric     if (DemandedElts.isZero()) {
1764e8d8bef9SDimitry Andric       return ConstantInt::getNullValue(VTy);
1765e8d8bef9SDimitry Andric     }
1766e8d8bef9SDimitry Andric 
1767e8d8bef9SDimitry Andric     // We know that the upper bits are set to zero.
1768e8d8bef9SDimitry Andric     Known.Zero.setBitsFrom(ArgWidth);
1769e8d8bef9SDimitry Andric     KnownBitsComputed = true;
1770e8d8bef9SDimitry Andric     break;
1771e8d8bef9SDimitry Andric   }
1772e8d8bef9SDimitry Andric   }
1773e8d8bef9SDimitry Andric   return None;
1774e8d8bef9SDimitry Andric }
1775e8d8bef9SDimitry Andric 
1776e8d8bef9SDimitry Andric Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1777e8d8bef9SDimitry Andric     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1778e8d8bef9SDimitry Andric     APInt &UndefElts2, APInt &UndefElts3,
1779e8d8bef9SDimitry Andric     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1780e8d8bef9SDimitry Andric         simplifyAndSetOp) const {
1781e8d8bef9SDimitry Andric   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1782e8d8bef9SDimitry Andric   switch (II.getIntrinsicID()) {
1783e8d8bef9SDimitry Andric   default:
1784e8d8bef9SDimitry Andric     break;
1785e8d8bef9SDimitry Andric   case Intrinsic::x86_xop_vfrcz_ss:
1786e8d8bef9SDimitry Andric   case Intrinsic::x86_xop_vfrcz_sd:
1787e8d8bef9SDimitry Andric     // The instructions for these intrinsics are specified to zero the upper
1788e8d8bef9SDimitry Andric     // bits, not pass them through like other scalar intrinsics. So we
1789e8d8bef9SDimitry Andric     // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
1790e8d8bef9SDimitry Andric     // other intrinsics. Instead we should return a zero vector.
1791e8d8bef9SDimitry Andric     if (!DemandedElts[0]) {
1792e8d8bef9SDimitry Andric       IC.addToWorklist(&II);
1793e8d8bef9SDimitry Andric       return ConstantAggregateZero::get(II.getType());
1794e8d8bef9SDimitry Andric     }
1795e8d8bef9SDimitry Andric 
1796e8d8bef9SDimitry Andric     // Only the lower element is used.
1797e8d8bef9SDimitry Andric     DemandedElts = 1;
1798e8d8bef9SDimitry Andric     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1799e8d8bef9SDimitry Andric 
1800e8d8bef9SDimitry Andric     // Only the lower element is undefined. The high elements are zero.
1801e8d8bef9SDimitry Andric     UndefElts = UndefElts[0];
1802e8d8bef9SDimitry Andric     break;
1803e8d8bef9SDimitry Andric 
1804e8d8bef9SDimitry Andric   // Unary scalar-as-vector operations that work column-wise.
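  // (Illustrative sketch, not from the original source: "column-wise" means
  // lane 0 is computed and the upper lanes pass through from the same
  // operand, e.g. a scalar model of rcpss with assumed names:
  //
  //   void rcpssModel(float Result[4], const float A[4]) {
  //     Result[0] = 1.0f / A[0]; // approximated on real hardware
  //     for (int I = 1; I != 4; ++I)
  //       Result[I] = A[I];      // pass-through
  //   }
  //
  // hence when lane 0 is not demanded, the call below simplifies to its own
  // first operand.)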
1805e8d8bef9SDimitry Andric case Intrinsic::x86_sse_rcp_ss: 1806e8d8bef9SDimitry Andric case Intrinsic::x86_sse_rsqrt_ss: 1807e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1808e8d8bef9SDimitry Andric 1809e8d8bef9SDimitry Andric // If lowest element of a scalar op isn't used then use Arg0. 1810e8d8bef9SDimitry Andric if (!DemandedElts[0]) { 1811e8d8bef9SDimitry Andric IC.addToWorklist(&II); 1812e8d8bef9SDimitry Andric return II.getArgOperand(0); 1813e8d8bef9SDimitry Andric } 1814e8d8bef9SDimitry Andric // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 1815e8d8bef9SDimitry Andric // checks). 1816e8d8bef9SDimitry Andric break; 1817e8d8bef9SDimitry Andric 1818e8d8bef9SDimitry Andric // Binary scalar-as-vector operations that work column-wise. The high 1819e8d8bef9SDimitry Andric // elements come from operand 0. The low element is a function of both 1820e8d8bef9SDimitry Andric // operands. 1821e8d8bef9SDimitry Andric case Intrinsic::x86_sse_min_ss: 1822e8d8bef9SDimitry Andric case Intrinsic::x86_sse_max_ss: 1823e8d8bef9SDimitry Andric case Intrinsic::x86_sse_cmp_ss: 1824e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_min_sd: 1825e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_max_sd: 1826e8d8bef9SDimitry Andric case Intrinsic::x86_sse2_cmp_sd: { 1827e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1828e8d8bef9SDimitry Andric 1829e8d8bef9SDimitry Andric // If lowest element of a scalar op isn't used then use Arg0. 1830e8d8bef9SDimitry Andric if (!DemandedElts[0]) { 1831e8d8bef9SDimitry Andric IC.addToWorklist(&II); 1832e8d8bef9SDimitry Andric return II.getArgOperand(0); 1833e8d8bef9SDimitry Andric } 1834e8d8bef9SDimitry Andric 1835e8d8bef9SDimitry Andric // Only lower element is used for operand 1. 1836e8d8bef9SDimitry Andric DemandedElts = 1; 1837e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1838e8d8bef9SDimitry Andric 1839e8d8bef9SDimitry Andric // Lower element is undefined if both lower elements are undefined. 1840e8d8bef9SDimitry Andric // Consider things like undef&0. The result is known zero, not undef. 1841e8d8bef9SDimitry Andric if (!UndefElts2[0]) 1842e8d8bef9SDimitry Andric UndefElts.clearBit(0); 1843e8d8bef9SDimitry Andric 1844e8d8bef9SDimitry Andric break; 1845e8d8bef9SDimitry Andric } 1846e8d8bef9SDimitry Andric 1847e8d8bef9SDimitry Andric // Binary scalar-as-vector operations that work column-wise. The high 1848e8d8bef9SDimitry Andric // elements come from operand 0 and the low element comes from operand 1. 1849e8d8bef9SDimitry Andric case Intrinsic::x86_sse41_round_ss: 1850e8d8bef9SDimitry Andric case Intrinsic::x86_sse41_round_sd: { 1851e8d8bef9SDimitry Andric // Don't use the low element of operand 0. 1852e8d8bef9SDimitry Andric APInt DemandedElts2 = DemandedElts; 1853e8d8bef9SDimitry Andric DemandedElts2.clearBit(0); 1854e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 1855e8d8bef9SDimitry Andric 1856e8d8bef9SDimitry Andric // If lowest element of a scalar op isn't used then use Arg0. 1857e8d8bef9SDimitry Andric if (!DemandedElts[0]) { 1858e8d8bef9SDimitry Andric IC.addToWorklist(&II); 1859e8d8bef9SDimitry Andric return II.getArgOperand(0); 1860e8d8bef9SDimitry Andric } 1861e8d8bef9SDimitry Andric 1862e8d8bef9SDimitry Andric // Only lower element is used for operand 1. 
1863e8d8bef9SDimitry Andric DemandedElts = 1; 1864e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1865e8d8bef9SDimitry Andric 1866e8d8bef9SDimitry Andric // Take the high undef elements from operand 0 and take the lower element 1867e8d8bef9SDimitry Andric // from operand 1. 1868e8d8bef9SDimitry Andric UndefElts.clearBit(0); 1869e8d8bef9SDimitry Andric UndefElts |= UndefElts2[0]; 1870e8d8bef9SDimitry Andric break; 1871e8d8bef9SDimitry Andric } 1872e8d8bef9SDimitry Andric 1873e8d8bef9SDimitry Andric // Three input scalar-as-vector operations that work column-wise. The high 1874e8d8bef9SDimitry Andric // elements come from operand 0 and the low element is a function of all 1875e8d8bef9SDimitry Andric // three inputs. 1876e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_add_ss_round: 1877e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_div_ss_round: 1878e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_mul_ss_round: 1879e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_sub_ss_round: 1880e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_max_ss_round: 1881e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_min_ss_round: 1882e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_add_sd_round: 1883e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_div_sd_round: 1884e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_mul_sd_round: 1885e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_sub_sd_round: 1886e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_max_sd_round: 1887e8d8bef9SDimitry Andric case Intrinsic::x86_avx512_mask_min_sd_round: 1888e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 1889e8d8bef9SDimitry Andric 1890e8d8bef9SDimitry Andric // If lowest element of a scalar op isn't used then use Arg0. 1891e8d8bef9SDimitry Andric if (!DemandedElts[0]) { 1892e8d8bef9SDimitry Andric IC.addToWorklist(&II); 1893e8d8bef9SDimitry Andric return II.getArgOperand(0); 1894e8d8bef9SDimitry Andric } 1895e8d8bef9SDimitry Andric 1896e8d8bef9SDimitry Andric // Only lower element is used for operand 1 and 2. 1897e8d8bef9SDimitry Andric DemandedElts = 1; 1898e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 1899e8d8bef9SDimitry Andric simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 1900e8d8bef9SDimitry Andric 1901e8d8bef9SDimitry Andric // Lower element is undefined if all three lower elements are undefined. 1902e8d8bef9SDimitry Andric // Consider things like undef&0. The result is known zero, not undef. 1903e8d8bef9SDimitry Andric if (!UndefElts2[0] || !UndefElts3[0]) 1904e8d8bef9SDimitry Andric UndefElts.clearBit(0); 1905e8d8bef9SDimitry Andric break; 1906e8d8bef9SDimitry Andric 1907e8d8bef9SDimitry Andric // TODO: Add fmaddsub support? 1908e8d8bef9SDimitry Andric case Intrinsic::x86_sse3_addsub_pd: 1909e8d8bef9SDimitry Andric case Intrinsic::x86_sse3_addsub_ps: 1910e8d8bef9SDimitry Andric case Intrinsic::x86_avx_addsub_pd_256: 1911e8d8bef9SDimitry Andric case Intrinsic::x86_avx_addsub_ps_256: { 1912e8d8bef9SDimitry Andric // If none of the even or none of the odd lanes are required, turn this 1913e8d8bef9SDimitry Andric // into a generic FP math instruction. 
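    // (Illustrative sketch, not from the original source: ADDSUB subtracts
    // in the even lanes and adds in the odd lanes, e.g. a scalar model of
    // the v4f32 variant with assumed names:
    //
    //   void addsubpsModel(float R[4], const float A[4], const float B[4]) {
    //     for (int I = 0; I != 4; ++I)
    //       R[I] = (I & 1) ? A[I] + B[I] : A[I] - B[I];
    //   }
    //
    // so if only odd lanes are demanded the call is a plain fadd, and if
    // only even lanes are demanded it is a plain fsub, as selected below.)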
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used, then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // The lower element is undefined if all three lower elements are
    // undefined. Consider things like undef & 0. The result is known zero,
    // not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
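    // For example (illustrative), @llvm.x86.sse3.addsub.ps computes
    //   { a[0]-b[0], a[1]+b[1], a[2]-b[2], a[3]+b[3] }
    // so a caller that only demands the even elements sees a plain fsub, and
    // one that only demands the odd elements sees a plain fadd.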
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
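    // A worked example (illustrative): if only the low 8 result elements of
    //   @llvm.x86.avx2.packsswb(<16 x i16> %x, <16 x i16> %y)
    // are demanded, they are all produced from x[0..7] (lane 0 of operand 0),
    // so no element of %y is demanded at all.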
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64 bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return None;
}