//===- InstCombineCalls.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitCall, visitInvoke, and visitCallBr functions.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> GuardWideningWindow(
    "instcombine-guard-widening-window",
    cl::init(3),
    cl::desc("How wide an instruction window to bypass looking for "
             "another guard"));

/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
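/// For example, i8 and i16 arguments are widened to i32, mirroring C's default
/// argument promotions for variadic calls; wider integer types are unchanged.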
static Type *getPromotedType(Type *Ty) {
  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
           "Unexpected constant data vector element type");
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}

Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
  Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
  MaybeAlign CopyDstAlign = MI->getDestAlign();
  if (!CopyDstAlign || *CopyDstAlign < DstAlign) {
    MI->setDestAlignment(DstAlign);
    return MI;
  }

  Align SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
  MaybeAlign CopySrcAlign = MI->getSourceAlign();
  if (!CopySrcAlign || *CopySrcAlign < SrcAlign) {
    MI->setSourceAlignment(SrcAlign);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
  // load/store.
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
  if (!MemOpLength) return nullptr;

  // Source and destination pointer types are always "i8*" for intrinsic. See
  // if the size is something we can handle with a single primitive load/store.
  // A single load+store correctly handles overlapping memory in the memmove
  // case.
  uint64_t Size = MemOpLength->getLimitedValue();
  assert(Size && "0-sized memory transferring should be removed already.");

  if (Size > 8 || (Size&(Size-1)))
    return nullptr;  // If not 1/2/4/8 bytes, exit.

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not an evident performance gain so
  // disable it now.
  if (isa<AtomicMemTransferInst>(MI))
    if (*CopyDstAlign < Size || *CopySrcAlign < Size)
      return nullptr;

  // Use an integer load+store unless we can find something better.
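  // Illustrative sketch of the rewrite below: a 4-byte, non-atomic
  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 4, i1 false)
  // becomes (modulo the bitcasts and copied metadata)
  //   %v = load i32, i32* %s.cast, align <src align>
  //   store i32 %v, i32* %d.cast, align <dst align>
  // after which the now zero-length memcpy is removed on a later iteration.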
  unsigned SrcAddrSp =
    cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
  unsigned DstAddrSp =
    cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();

  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);

  // If the memcpy has metadata describing the members, see if we can get the
  // TBAA tag describing our copy.
  MDNode *CopyMD = nullptr;
  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
    CopyMD = M;
  } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
    if (M->getNumOperands() == 3 && M->getOperand(0) &&
        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
        M->getOperand(1) &&
        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
        Size &&
        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
      CopyMD = cast<MDNode>(M->getOperand(2));
  }

  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
  LoadInst *L = Builder.CreateLoad(IntType, Src);
  // Alignment from the mem intrinsic will be better, so use it.
  L->setAlignment(*CopySrcAlign);
  if (CopyMD)
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  MDNode *LoopMemParallelMD =
    MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
  if (LoopMemParallelMD)
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
  if (AccessGroupMD)
    L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  StoreInst *S = Builder.CreateStore(L, Dest);
  // Alignment from the mem intrinsic will be better, so use it.
  S->setAlignment(*CopyDstAlign);
  if (CopyMD)
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  if (AccessGroupMD)
    S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
    // non-atomics can be volatile
    L->setVolatile(MT->isVolatile());
    S->setVolatile(MT->isVolatile());
  }
  if (isa<AtomicMemTransferInst>(MI)) {
    // atomics have to be unordered
    L->setOrdering(AtomicOrdering::Unordered);
    S->setOrdering(AtomicOrdering::Unordered);
  }

  // Set the size of the copy to 0, it will be deleted on the next iteration.
  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
  return MI;
}

Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
  const Align KnownAlignment =
      getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
  MaybeAlign MemSetAlign = MI->getDestAlign();
  if (!MemSetAlign || *MemSetAlign < KnownAlignment) {
    MI->setDestAlignment(KnownAlignment);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // Extract the length and alignment and fill if they are constant.
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
    return nullptr;
  const uint64_t Len = LenC->getLimitedValue();
  assert(Len && "0-sized memory setting should be removed already.");
  const Align Alignment = assumeAligned(MI->getDestAlignment());

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not an evident performance gain so
  // disable it now.
  if (isa<AtomicMemSetInst>(MI))
    if (Alignment < Len)
      return nullptr;

  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

    Value *Dest = MI->getDest();
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);

    // Extract the fill value and store.
    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
                                       MI->isVolatile());
    S->setAlignment(Alignment);
    if (isa<AtomicMemSetInst>(MI))
      S->setOrdering(AtomicOrdering::Unordered);

    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(LenC->getType()));
    return MI;
  }

  return nullptr;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<VectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  auto AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<VectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
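  // e.g. (illustrative) psrl.d with a constant amount vector of
  // <i32 3, i32 0, i32 0, i32 0>: the low 64 bits give Count == 3, so the call
  // becomes a plain lshr by a splat of 3.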
  auto CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
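// For example (illustrative), a psrlv.d whose shift-amount operand is the
// constant <4 x i32> <i32 1, i32 2, i32 3, i32 4> has every lane in range, so
// it can be rewritten as a plain lshr by that same constant vector.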
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<VectorType>(II.getType());
  auto SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  APInt UpperBits =
      APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
  if (llvm::MaskedValueIsZero(Amt, UpperBits,
                              II.getModule()->getDataLayout())) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (CElt && isa<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<VectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<VectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
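  // e.g. (illustrative) for a 256-bit word-to-byte pack (two v16i16 sources
  // into v32i8), the mask built below is <0..7, 16..23, 8..15, 24..31>: each
  // 128-bit lane takes eight elements from Arg0 followed by eight from Arg1.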
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<VectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerVecTy = VectorType::getInteger(ArgTy);
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          { Op1, Op2 });
    // The types have to be adjusted to match the x86 call types.
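    // (The x86 intrinsic returns { i8 carry-out, iN sum } while
    // llvm.uadd.with.overflow returns { iN sum, i1 overflow }, hence the zext
    // of the overflow bit and the swapped insertvalue indices below.)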
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //   [3:0] - zero mask for each 32-bit lane
  //   [5:4] - select one 32-bit destination lane
  //   [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
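  // e.g. (illustrative) a control byte of 0x03 selects byte 3 of the same
  // 128-bit lane of the source, while any control byte with bit 7 set (such as
  // 0x80) selects the corresponding element of the zero vector used as the
  // second shuffle operand below.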
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128 half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size));
}

// TODO, Obvious Missing Transforms:
// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
  Value *LoadPtr = II.getArgOperand(0);
  const Align Alignment =
      cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();

  // If the mask is all ones or undefs, this is a plain vector load of the 1st
  // argument.
  if (maskIsAllOneOrUndef(II.getArgOperand(2)))
    return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                     "unmaskedload");

  // If we can unconditionally load from this address, replace with a
  // load/select idiom. TODO: use DT for context sensitive query
  if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment,
                                         II.getModule()->getDataLayout(), &II,
                                         nullptr)) {
    Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                          "unmaskedload");
    return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
  }

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Narrow width by halfs excluding zero/undef lanes
Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, this instruction does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // If the mask is all ones, this is a plain vector store of the 1st argument.
  if (ConstMask->isAllOnesValue()) {
    Value *StorePtr = II.getArgOperand(1);
    Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
  }

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts))
    return replaceOperand(II, 0, V);

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane load -> load
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load
// * Narrow width by halfs excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load
Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) {
  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Adjacent vector addresses -> masked.store
// * Narrow store width by halfs excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar store
// * Vector incrementing address -> vector masked store
Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, a scatter does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts))
    return replaceOperand(II, 0, V);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1),
                                            DemandedElts, UndefElts))
    return replaceOperand(II, 1, V);

  return nullptr;
}

/// This function transforms launder.invariant.group and strip.invariant.group
/// like:
/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
/// launder(strip(%x)) -> launder(%x)
/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
/// strip(launder(%x)) -> strip(%x)
/// This is legal because it preserves the most recent information about
/// the presence or absence of invariant.group.
static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
                                                    InstCombiner &IC) {
  auto *Arg = II.getArgOperand(0);
  auto *StrippedArg = Arg->stripPointerCasts();
  auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
  if (StrippedArg == StrippedInvariantGroupsArg)
    return nullptr; // No launders/strips to remove.

  Value *Result = nullptr;

  if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
    Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
  else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
    Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
  else
    llvm_unreachable(
        "simplifyInvariantGroupIntrinsic only handles launder and strip");
  if (Result->getType()->getPointerAddressSpace() !=
      II.getType()->getPointerAddressSpace())
    Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
  if (Result->getType() != II.getType())
    Result = IC.Builder.CreateBitCast(Result, II.getType());

  return cast<Instruction>(Result);
}

static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
  assert((II.getIntrinsicID() == Intrinsic::cttz ||
          II.getIntrinsicID() == Intrinsic::ctlz) &&
         "Expected cttz or ctlz intrinsic");
  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
  Value *Op0 = II.getArgOperand(0);
  Value *X;
  // ctlz(bitreverse(x)) -> cttz(x)
  // cttz(bitreverse(x)) -> ctlz(x)
  if (match(Op0, m_BitReverse(m_Value(X)))) {
    Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
    Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
    return CallInst::Create(F, {X, II.getArgOperand(1)});
  }

  if (IsTZ) {
    // cttz(-x) -> cttz(x)
    if (match(Op0, m_Neg(m_Value(X))))
      return IC.replaceOperand(II, 0, X);

    // cttz(abs(x)) -> cttz(x)
    // cttz(nabs(x)) -> cttz(x)
    Value *Y;
    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
    if (SPF == SPF_ABS || SPF == SPF_NABS)
      return IC.replaceOperand(II, 0, X);
  }

  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);

  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
                                : Known.countMaxLeadingZeros();
  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
                                : Known.countMinLeadingZeros();

  // If all bits above (ctlz) or below (cttz) the first known one are known
  // zero, this value is constant.
  // FIXME: This should be in InstSimplify because we're replacing an
  // instruction with a constant.
  if (PossibleZeros == DefiniteZeros) {
    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
    return IC.replaceInstUsesWith(II, C);
  }

  // If the input to cttz/ctlz is known to be non-zero,
  // then change the 'ZeroIsUndef' parameter to 'true'
  // because we know the zero behavior can't affect the result.
  if (!Known.One.isNullValue() ||
      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
                     &IC.getDominatorTree())) {
    if (!match(II.getArgOperand(1), m_One()))
      return IC.replaceOperand(II, 1, IC.Builder.getTrue());
  }

  // Add range metadata since known bits can't completely reflect what we know.
  // TODO: Handle splat vectors.
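  // e.g. (illustrative) for an i32 ctlz whose operand is known to have bit 15
  // set, DefiniteZeros may be 0 and PossibleZeros is 16, so we attach
  // !range [0, 17).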
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
         "Expected ctpop intrinsic");
  Type *Ty = II.getType();
  unsigned BitWidth = Ty->getScalarSizeInBits();
  Value *Op0 = II.getArgOperand(0);
  Value *X;

  // ctpop(bitreverse(x)) -> ctpop(x)
  // ctpop(bswap(x)) -> ctpop(x)
  if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
    return IC.replaceOperand(II, 0, X);

  // ctpop(x | -x) -> bitwidth - cttz(x, false)
  if (Op0->hasOneUse() &&
      match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
    Function *F =
        Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
    auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
    auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
    return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
  }

  // ctpop(~x & (x - 1)) -> cttz(x, false)
  if (match(Op0,
            m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
    Function *F =
        Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
    return CallInst::Create(F, {X, IC.Builder.getFalse()});
  }

  // FIXME: Try to simplify vectors of integers.
  auto *IT = dyn_cast<IntegerType>(Ty);
  if (!IT)
    return nullptr;

  KnownBits Known(BitWidth);
  IC.computeKnownBits(Op0, Known, 0, &II);

  unsigned MinCount = Known.countMinPopulation();
  unsigned MaxCount = Known.countMaxPopulation();

  // Add range metadata since known bits can't completely reflect what we know.
  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Special case a zero mask since that's not a ConstantDataVector.
  // This masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return nullptr;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  // The pass-through vector for an x86 masked load is a zero vector.
  CallInst *NewMaskedLoad =
      IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
  return IC.replaceInstUsesWith(II, NewMaskedLoad);
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Special case a zero mask since that's not a ConstantDataVector:
  // this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return false;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

  // 'Replace uses' doesn't work for stores. Erase the original masked store.
  IC.eraseInstFromFunction(II);
  return true;
}

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
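//
// e.g. (illustrative) fmed3(1.0, 5.0, 3.0): the overall maximum 5.0 is
// identified and discarded, and the maximum of the remaining pair, 3.0, is
// returned as the median.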
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

/// Convert a table lookup to shufflevector if the mask is constant.
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
/// which case we could lower the shufflevector with rev64 instructions
/// as it's actually a byte reverse.
static Value *simplifyNeonTbl1(const IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder) {
  // Bail out if the mask is not a constant.
  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
  if (!C)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();

  // Only perform this transformation for <8 x i8> vector types.
  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
    return nullptr;

  int Indexes[8];

  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = C->getAggregateElement(I);

    if (!COp || !isa<ConstantInt>(COp))
      return nullptr;

    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();

    // Make sure the mask indices are in range.
    if ((unsigned)Indexes[I] >= NumElts)
      return nullptr;
  }

  auto *V1 = II.getArgOperand(0);
  auto *V2 = Constant::getNullValue(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes));
}

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II,
                               unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
                       MemAlign : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
                             unsigned NumOperands) {
  assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
  assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
  for (unsigned i = 0; i < NumOperands; i++)
    if (I.getArgOperand(i) != E.getArgOperand(i))
      return false;
  return true;
}

// Remove trivially empty start/end intrinsic ranges, i.e. a start
// immediately followed by an end (ignoring debuginfo or other
// start/end intrinsics in between).
As this handles only the most trivial 1535 // cases, tracking the nesting level is not needed: 1536 // 1537 // call @llvm.foo.start(i1 0) 1538 // call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed 1539 // call @llvm.foo.end(i1 0) 1540 // call @llvm.foo.end(i1 0) ; &I 1541 static bool removeTriviallyEmptyRange( 1542 IntrinsicInst &EndI, InstCombiner &IC, 1543 std::function<bool(const IntrinsicInst &)> IsStart) { 1544 // We start from the end intrinsic and scan backwards, so that InstCombine 1545 // has already processed (and potentially removed) all the instructions 1546 // before the end intrinsic. 1547 BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend()); 1548 for (; BI != BE; ++BI) { 1549 if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) { 1550 if (isa<DbgInfoIntrinsic>(I) || 1551 I->getIntrinsicID() == EndI.getIntrinsicID()) 1552 continue; 1553 if (IsStart(*I)) { 1554 if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) { 1555 IC.eraseInstFromFunction(*I); 1556 IC.eraseInstFromFunction(EndI); 1557 return true; 1558 } 1559 // Skip start intrinsics that don't pair with this end intrinsic. 1560 continue; 1561 } 1562 } 1563 break; 1564 } 1565 1566 return false; 1567 } 1568 1569 // Convert NVVM intrinsics to target-generic LLVM code where possible. 1570 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 1571 // Each NVVM intrinsic we can simplify can be replaced with one of: 1572 // 1573 // * an LLVM intrinsic, 1574 // * an LLVM cast operation, 1575 // * an LLVM binary operation, or 1576 // * ad-hoc LLVM IR for the particular operation. 1577 1578 // Some transformations are only valid when the module's 1579 // flush-denormals-to-zero (ftz) setting is true/false, whereas other 1580 // transformations are valid regardless of the module's ftz setting. 1581 enum FtzRequirementTy { 1582 FTZ_Any, // Any ftz setting is ok. 1583 FTZ_MustBeOn, // Transformation is valid only if ftz is on. 1584 FTZ_MustBeOff, // Transformation is valid only if ftz is off. 1585 }; 1586 // Classes of NVVM intrinsics that can't be replaced one-to-one with a 1587 // target-generic intrinsic, cast op, or binary op but that we can nonetheless 1588 // simplify. 1589 enum SpecialCase { 1590 SPC_Reciprocal, 1591 }; 1592 1593 // SimplifyAction is a poor-man's variant (plus an additional flag) that 1594 // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 1595 struct SimplifyAction { 1596 // Invariant: At most one of these Optionals has a value. 1597 Optional<Intrinsic::ID> IID; 1598 Optional<Instruction::CastOps> CastOp; 1599 Optional<Instruction::BinaryOps> BinaryOp; 1600 Optional<SpecialCase> Special; 1601 1602 FtzRequirementTy FtzRequirement = FTZ_Any; 1603 1604 SimplifyAction() = default; 1605 1606 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) 1607 : IID(IID), FtzRequirement(FtzReq) {} 1608 1609 // Cast operations don't have anything to do with FTZ, so we skip that 1610 // argument. 1611 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 1612 1613 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 1614 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 1615 1616 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 1617 : Special(Special), FtzRequirement(FtzReq) {} 1618 }; 1619 1620 // Try to generate a SimplifyAction describing how to replace our 1621 // IntrinsicInstr with target-generic LLVM IR. 
1622 const SimplifyAction Action = [II]() -> SimplifyAction { 1623 switch (II->getIntrinsicID()) { 1624 // NVVM intrinsics that map directly to LLVM intrinsics. 1625 case Intrinsic::nvvm_ceil_d: 1626 return {Intrinsic::ceil, FTZ_Any}; 1627 case Intrinsic::nvvm_ceil_f: 1628 return {Intrinsic::ceil, FTZ_MustBeOff}; 1629 case Intrinsic::nvvm_ceil_ftz_f: 1630 return {Intrinsic::ceil, FTZ_MustBeOn}; 1631 case Intrinsic::nvvm_fabs_d: 1632 return {Intrinsic::fabs, FTZ_Any}; 1633 case Intrinsic::nvvm_fabs_f: 1634 return {Intrinsic::fabs, FTZ_MustBeOff}; 1635 case Intrinsic::nvvm_fabs_ftz_f: 1636 return {Intrinsic::fabs, FTZ_MustBeOn}; 1637 case Intrinsic::nvvm_floor_d: 1638 return {Intrinsic::floor, FTZ_Any}; 1639 case Intrinsic::nvvm_floor_f: 1640 return {Intrinsic::floor, FTZ_MustBeOff}; 1641 case Intrinsic::nvvm_floor_ftz_f: 1642 return {Intrinsic::floor, FTZ_MustBeOn}; 1643 case Intrinsic::nvvm_fma_rn_d: 1644 return {Intrinsic::fma, FTZ_Any}; 1645 case Intrinsic::nvvm_fma_rn_f: 1646 return {Intrinsic::fma, FTZ_MustBeOff}; 1647 case Intrinsic::nvvm_fma_rn_ftz_f: 1648 return {Intrinsic::fma, FTZ_MustBeOn}; 1649 case Intrinsic::nvvm_fmax_d: 1650 return {Intrinsic::maxnum, FTZ_Any}; 1651 case Intrinsic::nvvm_fmax_f: 1652 return {Intrinsic::maxnum, FTZ_MustBeOff}; 1653 case Intrinsic::nvvm_fmax_ftz_f: 1654 return {Intrinsic::maxnum, FTZ_MustBeOn}; 1655 case Intrinsic::nvvm_fmin_d: 1656 return {Intrinsic::minnum, FTZ_Any}; 1657 case Intrinsic::nvvm_fmin_f: 1658 return {Intrinsic::minnum, FTZ_MustBeOff}; 1659 case Intrinsic::nvvm_fmin_ftz_f: 1660 return {Intrinsic::minnum, FTZ_MustBeOn}; 1661 case Intrinsic::nvvm_round_d: 1662 return {Intrinsic::round, FTZ_Any}; 1663 case Intrinsic::nvvm_round_f: 1664 return {Intrinsic::round, FTZ_MustBeOff}; 1665 case Intrinsic::nvvm_round_ftz_f: 1666 return {Intrinsic::round, FTZ_MustBeOn}; 1667 case Intrinsic::nvvm_sqrt_rn_d: 1668 return {Intrinsic::sqrt, FTZ_Any}; 1669 case Intrinsic::nvvm_sqrt_f: 1670 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 1671 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 1672 // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are 1673 // the versions with explicit ftz-ness. 1674 return {Intrinsic::sqrt, FTZ_Any}; 1675 case Intrinsic::nvvm_sqrt_rn_f: 1676 return {Intrinsic::sqrt, FTZ_MustBeOff}; 1677 case Intrinsic::nvvm_sqrt_rn_ftz_f: 1678 return {Intrinsic::sqrt, FTZ_MustBeOn}; 1679 case Intrinsic::nvvm_trunc_d: 1680 return {Intrinsic::trunc, FTZ_Any}; 1681 case Intrinsic::nvvm_trunc_f: 1682 return {Intrinsic::trunc, FTZ_MustBeOff}; 1683 case Intrinsic::nvvm_trunc_ftz_f: 1684 return {Intrinsic::trunc, FTZ_MustBeOn}; 1685 1686 // NVVM intrinsics that map to LLVM cast operations. 1687 // 1688 // Note that llvm's target-generic conversion operators correspond to the rz 1689 // (round to zero) versions of the nvvm conversion intrinsics, even though 1690 // most everything else here uses the rn (round to nearest even) nvvm ops. 
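// For example (illustrative IR for the mappings below), nvvm_f2i_rz lowers to
// 'fptosi float to i32' and nvvm_ui2d_rz lowers to 'uitofp i32 to double'.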
1691 case Intrinsic::nvvm_d2i_rz: 1692 case Intrinsic::nvvm_f2i_rz: 1693 case Intrinsic::nvvm_d2ll_rz: 1694 case Intrinsic::nvvm_f2ll_rz: 1695 return {Instruction::FPToSI}; 1696 case Intrinsic::nvvm_d2ui_rz: 1697 case Intrinsic::nvvm_f2ui_rz: 1698 case Intrinsic::nvvm_d2ull_rz: 1699 case Intrinsic::nvvm_f2ull_rz: 1700 return {Instruction::FPToUI}; 1701 case Intrinsic::nvvm_i2d_rz: 1702 case Intrinsic::nvvm_i2f_rz: 1703 case Intrinsic::nvvm_ll2d_rz: 1704 case Intrinsic::nvvm_ll2f_rz: 1705 return {Instruction::SIToFP}; 1706 case Intrinsic::nvvm_ui2d_rz: 1707 case Intrinsic::nvvm_ui2f_rz: 1708 case Intrinsic::nvvm_ull2d_rz: 1709 case Intrinsic::nvvm_ull2f_rz: 1710 return {Instruction::UIToFP}; 1711 1712 // NVVM intrinsics that map to LLVM binary ops. 1713 case Intrinsic::nvvm_add_rn_d: 1714 return {Instruction::FAdd, FTZ_Any}; 1715 case Intrinsic::nvvm_add_rn_f: 1716 return {Instruction::FAdd, FTZ_MustBeOff}; 1717 case Intrinsic::nvvm_add_rn_ftz_f: 1718 return {Instruction::FAdd, FTZ_MustBeOn}; 1719 case Intrinsic::nvvm_mul_rn_d: 1720 return {Instruction::FMul, FTZ_Any}; 1721 case Intrinsic::nvvm_mul_rn_f: 1722 return {Instruction::FMul, FTZ_MustBeOff}; 1723 case Intrinsic::nvvm_mul_rn_ftz_f: 1724 return {Instruction::FMul, FTZ_MustBeOn}; 1725 case Intrinsic::nvvm_div_rn_d: 1726 return {Instruction::FDiv, FTZ_Any}; 1727 case Intrinsic::nvvm_div_rn_f: 1728 return {Instruction::FDiv, FTZ_MustBeOff}; 1729 case Intrinsic::nvvm_div_rn_ftz_f: 1730 return {Instruction::FDiv, FTZ_MustBeOn}; 1731 1732 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 1733 // need special handling. 1734 // 1735 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 1736 // as well. 1737 case Intrinsic::nvvm_rcp_rn_d: 1738 return {SPC_Reciprocal, FTZ_Any}; 1739 case Intrinsic::nvvm_rcp_rn_f: 1740 return {SPC_Reciprocal, FTZ_MustBeOff}; 1741 case Intrinsic::nvvm_rcp_rn_ftz_f: 1742 return {SPC_Reciprocal, FTZ_MustBeOn}; 1743 1744 // We do not currently simplify intrinsics that give an approximate answer. 1745 // These include: 1746 // 1747 // - nvvm_cos_approx_{f,ftz_f} 1748 // - nvvm_ex2_approx_{d,f,ftz_f} 1749 // - nvvm_lg2_approx_{d,f,ftz_f} 1750 // - nvvm_sin_approx_{f,ftz_f} 1751 // - nvvm_sqrt_approx_{f,ftz_f} 1752 // - nvvm_rsqrt_approx_{d,f,ftz_f} 1753 // - nvvm_div_approx_{ftz_d,ftz_f,f} 1754 // - nvvm_rcp_approx_ftz_d 1755 // 1756 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 1757 // means that fastmath is enabled in the intrinsic. Unfortunately only 1758 // binary operators (currently) have a fastmath bit in SelectionDAG, so this 1759 // information gets lost and we can't select on it. 1760 // 1761 // TODO: div and rcp are lowered to a binary op, so these we could in theory 1762 // lower them to "fast fdiv". 1763 1764 default: 1765 return {}; 1766 } 1767 }(); 1768 1769 // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we 1770 // can bail out now. (Notice that in the case that IID is not an NVVM 1771 // intrinsic, we don't have to look up any module metadata, as 1772 // FtzRequirementTy will be FTZ_Any.) 1773 if (Action.FtzRequirement != FTZ_Any) { 1774 StringRef Attr = II->getFunction() 1775 ->getFnAttribute("denormal-fp-math-f32") 1776 .getValueAsString(); 1777 DenormalMode Mode = parseDenormalFPAttribute(Attr); 1778 bool FtzEnabled = Mode.Output != DenormalMode::IEEE; 1779 1780 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 1781 return nullptr; 1782 } 1783 1784 // Simplify to target-generic intrinsic. 
1785 if (Action.IID) { 1786 SmallVector<Value *, 4> Args(II->arg_operands()); 1787 // All the target-generic intrinsics currently of interest to us have one 1788 // type argument, equal to that of the nvvm intrinsic's argument. 1789 Type *Tys[] = {II->getArgOperand(0)->getType()}; 1790 return CallInst::Create( 1791 Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 1792 } 1793 1794 // Simplify to target-generic binary op. 1795 if (Action.BinaryOp) 1796 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 1797 II->getArgOperand(1), II->getName()); 1798 1799 // Simplify to target-generic cast op. 1800 if (Action.CastOp) 1801 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 1802 II->getName()); 1803 1804 // All that's left are the special cases. 1805 if (!Action.Special) 1806 return nullptr; 1807 1808 switch (*Action.Special) { 1809 case SPC_Reciprocal: 1810 // Simplify reciprocal. 1811 return BinaryOperator::Create( 1812 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 1813 II->getArgOperand(0), II->getName()); 1814 } 1815 llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 1816 } 1817 1818 Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) { 1819 removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) { 1820 return I.getIntrinsicID() == Intrinsic::vastart || 1821 I.getIntrinsicID() == Intrinsic::vacopy; 1822 }); 1823 return nullptr; 1824 } 1825 1826 static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { 1827 assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); 1828 Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); 1829 if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) { 1830 Call.setArgOperand(0, Arg1); 1831 Call.setArgOperand(1, Arg0); 1832 return &Call; 1833 } 1834 return nullptr; 1835 } 1836 1837 Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { 1838 WithOverflowInst *WO = cast<WithOverflowInst>(II); 1839 Value *OperationResult = nullptr; 1840 Constant *OverflowResult = nullptr; 1841 if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), 1842 WO->getRHS(), *WO, OperationResult, OverflowResult)) 1843 return CreateOverflowTuple(WO, OperationResult, OverflowResult); 1844 return nullptr; 1845 } 1846 1847 /// CallInst simplification. This mostly only handles folding of intrinsic 1848 /// instructions. For normal calls, it allows visitCallBase to do the heavy 1849 /// lifting. 1850 Instruction *InstCombiner::visitCallInst(CallInst &CI) { 1851 // Don't try to simplify calls without uses. It will not do anything useful, 1852 // but will result in the following folds being skipped. 1853 if (!CI.use_empty()) 1854 if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI))) 1855 return replaceInstUsesWith(CI, V); 1856 1857 if (isFreeCall(&CI, &TLI)) 1858 return visitFree(CI); 1859 1860 // If the caller function is nounwind, mark the call as nounwind, even if the 1861 // callee isn't. 1862 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1863 CI.setDoesNotThrow(); 1864 return &CI; 1865 } 1866 1867 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1868 if (!II) return visitCallBase(CI); 1869 1870 // For atomic unordered mem intrinsics if len is not a positive or 1871 // not a multiple of element size then behavior is undefined. 
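// For example, an element-wise atomic memcpy with an element size of 4 and a
// constant length of 7 can never be well defined, so it is turned into a
// non-terminator unreachable marker and erased below.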
1872 if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II)) 1873 if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength())) 1874 if (NumBytes->getSExtValue() < 0 || 1875 (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) { 1876 CreateNonTerminatorUnreachable(AMI); 1877 assert(AMI->getType()->isVoidTy() && 1878 "non void atomic unordered mem intrinsic"); 1879 return eraseInstFromFunction(*AMI); 1880 } 1881 1882 // Intrinsics cannot occur in an invoke or a callbr, so handle them here 1883 // instead of in visitCallBase. 1884 if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) { 1885 bool Changed = false; 1886 1887 // memmove/cpy/set of zero bytes is a noop. 1888 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1889 if (NumBytes->isNullValue()) 1890 return eraseInstFromFunction(CI); 1891 1892 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1893 if (CI->getZExtValue() == 1) { 1894 // Replace the instruction with just byte operations. We would 1895 // transform other cases to loads/stores, but we don't know if 1896 // alignment is sufficient. 1897 } 1898 } 1899 1900 // No other transformations apply to volatile transfers. 1901 if (auto *M = dyn_cast<MemIntrinsic>(MI)) 1902 if (M->isVolatile()) 1903 return nullptr; 1904 1905 // If we have a memmove and the source operation is a constant global, 1906 // then the source and dest pointers can't alias, so we can change this 1907 // into a call to memcpy. 1908 if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) { 1909 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1910 if (GVSrc->isConstant()) { 1911 Module *M = CI.getModule(); 1912 Intrinsic::ID MemCpyID = 1913 isa<AtomicMemMoveInst>(MMI) 1914 ? Intrinsic::memcpy_element_unordered_atomic 1915 : Intrinsic::memcpy; 1916 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1917 CI.getArgOperand(1)->getType(), 1918 CI.getArgOperand(2)->getType() }; 1919 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1920 Changed = true; 1921 } 1922 } 1923 1924 if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1925 // memmove(x,x,size) -> noop. 1926 if (MTI->getSource() == MTI->getDest()) 1927 return eraseInstFromFunction(CI); 1928 } 1929 1930 // If we can determine a pointer alignment that is bigger than currently 1931 // set, update the alignment. 1932 if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1933 if (Instruction *I = SimplifyAnyMemTransfer(MTI)) 1934 return I; 1935 } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) { 1936 if (Instruction *I = SimplifyAnyMemSet(MSI)) 1937 return I; 1938 } 1939 1940 if (Changed) return II; 1941 } 1942 1943 // For fixed width vector result intrinsics, use the generic demanded vector 1944 // support. 
1945 if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) { 1946 auto VWidth = IIFVTy->getNumElements(); 1947 APInt UndefElts(VWidth, 0); 1948 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 1949 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 1950 if (V != II) 1951 return replaceInstUsesWith(*II, V); 1952 return II; 1953 } 1954 } 1955 1956 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1957 return I; 1958 1959 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1960 unsigned DemandedWidth) { 1961 APInt UndefElts(Width, 0); 1962 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1963 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1964 }; 1965 1966 Intrinsic::ID IID = II->getIntrinsicID(); 1967 switch (IID) { 1968 default: break; 1969 case Intrinsic::objectsize: 1970 if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1971 return replaceInstUsesWith(CI, V); 1972 return nullptr; 1973 case Intrinsic::bswap: { 1974 Value *IIOperand = II->getArgOperand(0); 1975 Value *X = nullptr; 1976 1977 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1978 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1979 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1980 IIOperand->getType()->getPrimitiveSizeInBits(); 1981 Value *CV = ConstantInt::get(X->getType(), C); 1982 Value *V = Builder.CreateLShr(X, CV); 1983 return new TruncInst(V, IIOperand->getType()); 1984 } 1985 break; 1986 } 1987 case Intrinsic::masked_load: 1988 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II)) 1989 return replaceInstUsesWith(CI, SimplifiedMaskedOp); 1990 break; 1991 case Intrinsic::masked_store: 1992 return simplifyMaskedStore(*II); 1993 case Intrinsic::masked_gather: 1994 return simplifyMaskedGather(*II); 1995 case Intrinsic::masked_scatter: 1996 return simplifyMaskedScatter(*II); 1997 case Intrinsic::launder_invariant_group: 1998 case Intrinsic::strip_invariant_group: 1999 if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this)) 2000 return replaceInstUsesWith(*II, SkippedBarrier); 2001 break; 2002 case Intrinsic::powi: 2003 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2004 // 0 and 1 are handled in instsimplify 2005 2006 // powi(x, -1) -> 1/x 2007 if (Power->isMinusOne()) 2008 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 2009 II->getArgOperand(0)); 2010 // powi(x, 2) -> x*x 2011 if (Power->equalsInt(2)) 2012 return BinaryOperator::CreateFMul(II->getArgOperand(0), 2013 II->getArgOperand(0)); 2014 } 2015 break; 2016 2017 case Intrinsic::cttz: 2018 case Intrinsic::ctlz: 2019 if (auto *I = foldCttzCtlz(*II, *this)) 2020 return I; 2021 break; 2022 2023 case Intrinsic::ctpop: 2024 if (auto *I = foldCtpop(*II, *this)) 2025 return I; 2026 break; 2027 2028 case Intrinsic::fshl: 2029 case Intrinsic::fshr: { 2030 Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); 2031 Type *Ty = II->getType(); 2032 unsigned BitWidth = Ty->getScalarSizeInBits(); 2033 Constant *ShAmtC; 2034 if (match(II->getArgOperand(2), m_Constant(ShAmtC)) && 2035 !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) { 2036 // Canonicalize a shift amount constant operand to modulo the bit-width. 
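// e.g. (illustrative) fshl i8 %x, %y, 13 --> fshl i8 %x, %y, 5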
2037 Constant *WidthC = ConstantInt::get(Ty, BitWidth); 2038 Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); 2039 if (ModuloC != ShAmtC) 2040 return replaceOperand(*II, 2, ModuloC); 2041 2042 assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) == 2043 ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) && 2044 "Shift amount expected to be modulo bitwidth"); 2045 2046 // Canonicalize funnel shift right by constant to funnel shift left. This 2047 // is not entirely arbitrary. For historical reasons, the backend may 2048 // recognize rotate left patterns but miss rotate right patterns. 2049 if (IID == Intrinsic::fshr) { 2050 // fshr X, Y, C --> fshl X, Y, (BitWidth - C) 2051 Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); 2052 Module *Mod = II->getModule(); 2053 Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); 2054 return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC }); 2055 } 2056 assert(IID == Intrinsic::fshl && 2057 "All funnel shifts by simple constants should go left"); 2058 2059 // fshl(X, 0, C) --> shl X, C 2060 // fshl(X, undef, C) --> shl X, C 2061 if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef())) 2062 return BinaryOperator::CreateShl(Op0, ShAmtC); 2063 2064 // fshl(0, X, C) --> lshr X, (BW-C) 2065 // fshl(undef, X, C) --> lshr X, (BW-C) 2066 if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef())) 2067 return BinaryOperator::CreateLShr(Op1, 2068 ConstantExpr::getSub(WidthC, ShAmtC)); 2069 2070 // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) 2071 if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { 2072 Module *Mod = II->getModule(); 2073 Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); 2074 return CallInst::Create(Bswap, { Op0 }); 2075 } 2076 } 2077 2078 // Left or right might be masked. 2079 if (SimplifyDemandedInstructionBits(*II)) 2080 return &CI; 2081 2082 // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, 2083 // so only the low bits of the shift amount are demanded if the bitwidth is 2084 // a power-of-2. 2085 if (!isPowerOf2_32(BitWidth)) 2086 break; 2087 APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); 2088 KnownBits Op2Known(BitWidth); 2089 if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) 2090 return &CI; 2091 break; 2092 } 2093 case Intrinsic::uadd_with_overflow: 2094 case Intrinsic::sadd_with_overflow: { 2095 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2096 return I; 2097 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2098 return I; 2099 2100 // Given 2 constant operands whose sum does not overflow: 2101 // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1 2102 // saddo (X +nsw C0), C1 -> saddo X, C0 + C1 2103 Value *X; 2104 const APInt *C0, *C1; 2105 Value *Arg0 = II->getArgOperand(0); 2106 Value *Arg1 = II->getArgOperand(1); 2107 bool IsSigned = IID == Intrinsic::sadd_with_overflow; 2108 bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0))) 2109 : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0))); 2110 if (HasNWAdd && match(Arg1, m_APInt(C1))) { 2111 bool Overflow; 2112 APInt NewC = 2113 IsSigned ? 
C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow); 2114 if (!Overflow) 2115 return replaceInstUsesWith( 2116 *II, Builder.CreateBinaryIntrinsic( 2117 IID, X, ConstantInt::get(Arg1->getType(), NewC))); 2118 } 2119 break; 2120 } 2121 2122 case Intrinsic::umul_with_overflow: 2123 case Intrinsic::smul_with_overflow: 2124 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2125 return I; 2126 LLVM_FALLTHROUGH; 2127 2128 case Intrinsic::usub_with_overflow: 2129 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2130 return I; 2131 break; 2132 2133 case Intrinsic::ssub_with_overflow: { 2134 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2135 return I; 2136 2137 Constant *C; 2138 Value *Arg0 = II->getArgOperand(0); 2139 Value *Arg1 = II->getArgOperand(1); 2140 // Given a constant C that is not the minimum signed value 2141 // for an integer of a given bit width: 2142 // 2143 // ssubo X, C -> saddo X, -C 2144 if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) { 2145 Value *NegVal = ConstantExpr::getNeg(C); 2146 // Build a saddo call that is equivalent to the discovered 2147 // ssubo call. 2148 return replaceInstUsesWith( 2149 *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, 2150 Arg0, NegVal)); 2151 } 2152 2153 break; 2154 } 2155 2156 case Intrinsic::uadd_sat: 2157 case Intrinsic::sadd_sat: 2158 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2159 return I; 2160 LLVM_FALLTHROUGH; 2161 case Intrinsic::usub_sat: 2162 case Intrinsic::ssub_sat: { 2163 SaturatingInst *SI = cast<SaturatingInst>(II); 2164 Type *Ty = SI->getType(); 2165 Value *Arg0 = SI->getLHS(); 2166 Value *Arg1 = SI->getRHS(); 2167 2168 // Make use of known overflow information. 2169 OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(), 2170 Arg0, Arg1, SI); 2171 switch (OR) { 2172 case OverflowResult::MayOverflow: 2173 break; 2174 case OverflowResult::NeverOverflows: 2175 if (SI->isSigned()) 2176 return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1); 2177 else 2178 return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1); 2179 case OverflowResult::AlwaysOverflowsLow: { 2180 unsigned BitWidth = Ty->getScalarSizeInBits(); 2181 APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned()); 2182 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min)); 2183 } 2184 case OverflowResult::AlwaysOverflowsHigh: { 2185 unsigned BitWidth = Ty->getScalarSizeInBits(); 2186 APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned()); 2187 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max)); 2188 } 2189 } 2190 2191 // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN 2192 Constant *C; 2193 if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && 2194 C->isNotMinSignedValue()) { 2195 Value *NegVal = ConstantExpr::getNeg(C); 2196 return replaceInstUsesWith( 2197 *II, Builder.CreateBinaryIntrinsic( 2198 Intrinsic::sadd_sat, Arg0, NegVal)); 2199 } 2200 2201 // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) 2202 // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) 2203 // if Val and Val2 have the same sign 2204 if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { 2205 Value *X; 2206 const APInt *Val, *Val2; 2207 APInt NewVal; 2208 bool IsUnsigned = 2209 IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; 2210 if (Other->getIntrinsicID() == IID && 2211 match(Arg1, m_APInt(Val)) && 2212 match(Other->getArgOperand(0), m_Value(X)) && 2213 match(Other->getArgOperand(1), m_APInt(Val2))) { 2214 if (IsUnsigned) 2215 NewVal = Val->uadd_sat(*Val2); 2216 else 
if (Val->isNonNegative() == Val2->isNonNegative()) { 2217 bool Overflow; 2218 NewVal = Val->sadd_ov(*Val2, Overflow); 2219 if (Overflow) { 2220 // Both adds together may add more than SignedMaxValue 2221 // without saturating the final result. 2222 break; 2223 } 2224 } else { 2225 // Cannot fold saturated addition with different signs. 2226 break; 2227 } 2228 2229 return replaceInstUsesWith( 2230 *II, Builder.CreateBinaryIntrinsic( 2231 IID, X, ConstantInt::get(II->getType(), NewVal))); 2232 } 2233 } 2234 break; 2235 } 2236 2237 case Intrinsic::minnum: 2238 case Intrinsic::maxnum: 2239 case Intrinsic::minimum: 2240 case Intrinsic::maximum: { 2241 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2242 return I; 2243 Value *Arg0 = II->getArgOperand(0); 2244 Value *Arg1 = II->getArgOperand(1); 2245 Value *X, *Y; 2246 if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && 2247 (Arg0->hasOneUse() || Arg1->hasOneUse())) { 2248 // If both operands are negated, invert the call and negate the result: 2249 // min(-X, -Y) --> -(max(X, Y)) 2250 // max(-X, -Y) --> -(min(X, Y)) 2251 Intrinsic::ID NewIID; 2252 switch (IID) { 2253 case Intrinsic::maxnum: 2254 NewIID = Intrinsic::minnum; 2255 break; 2256 case Intrinsic::minnum: 2257 NewIID = Intrinsic::maxnum; 2258 break; 2259 case Intrinsic::maximum: 2260 NewIID = Intrinsic::minimum; 2261 break; 2262 case Intrinsic::minimum: 2263 NewIID = Intrinsic::maximum; 2264 break; 2265 default: 2266 llvm_unreachable("unexpected intrinsic ID"); 2267 } 2268 Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); 2269 Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall); 2270 FNeg->copyIRFlags(II); 2271 return FNeg; 2272 } 2273 2274 // m(m(X, C2), C1) -> m(X, C) 2275 const APFloat *C1, *C2; 2276 if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { 2277 if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && 2278 ((match(M->getArgOperand(0), m_Value(X)) && 2279 match(M->getArgOperand(1), m_APFloat(C2))) || 2280 (match(M->getArgOperand(1), m_Value(X)) && 2281 match(M->getArgOperand(0), m_APFloat(C2))))) { 2282 APFloat Res(0.0); 2283 switch (IID) { 2284 case Intrinsic::maxnum: 2285 Res = maxnum(*C1, *C2); 2286 break; 2287 case Intrinsic::minnum: 2288 Res = minnum(*C1, *C2); 2289 break; 2290 case Intrinsic::maximum: 2291 Res = maximum(*C1, *C2); 2292 break; 2293 case Intrinsic::minimum: 2294 Res = minimum(*C1, *C2); 2295 break; 2296 default: 2297 llvm_unreachable("unexpected intrinsic ID"); 2298 } 2299 Instruction *NewCall = Builder.CreateBinaryIntrinsic( 2300 IID, X, ConstantFP::get(Arg0->getType(), Res), II); 2301 // TODO: Conservatively intersecting FMF. If Res == C2, the transform 2302 // was a simplification (so Arg0 and its original flags could 2303 // propagate?) 
2304 NewCall->andIRFlags(M); 2305 return replaceInstUsesWith(*II, NewCall); 2306 } 2307 } 2308 2309 Value *ExtSrc0; 2310 Value *ExtSrc1; 2311 2312 // minnum (fpext x), (fpext y) -> minnum x, y 2313 // maxnum (fpext x), (fpext y) -> maxnum x, y 2314 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) && 2315 match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) && 2316 ExtSrc0->getType() == ExtSrc1->getType()) { 2317 Function *F = Intrinsic::getDeclaration( 2318 II->getModule(), II->getIntrinsicID(), {ExtSrc0->getType()}); 2319 CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 }); 2320 NewCall->copyFastMathFlags(II); 2321 NewCall->takeName(II); 2322 return new FPExtInst(NewCall, II->getType()); 2323 } 2324 2325 break; 2326 } 2327 case Intrinsic::fmuladd: { 2328 // Canonicalize fast fmuladd to the separate fmul + fadd. 2329 if (II->isFast()) { 2330 BuilderTy::FastMathFlagGuard Guard(Builder); 2331 Builder.setFastMathFlags(II->getFastMathFlags()); 2332 Value *Mul = Builder.CreateFMul(II->getArgOperand(0), 2333 II->getArgOperand(1)); 2334 Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); 2335 Add->takeName(II); 2336 return replaceInstUsesWith(*II, Add); 2337 } 2338 2339 // Try to simplify the underlying FMul. 2340 if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), 2341 II->getFastMathFlags(), 2342 SQ.getWithInstruction(II))) { 2343 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2344 FAdd->copyFastMathFlags(II); 2345 return FAdd; 2346 } 2347 2348 LLVM_FALLTHROUGH; 2349 } 2350 case Intrinsic::fma: { 2351 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2352 return I; 2353 2354 // fma fneg(x), fneg(y), z -> fma x, y, z 2355 Value *Src0 = II->getArgOperand(0); 2356 Value *Src1 = II->getArgOperand(1); 2357 Value *X, *Y; 2358 if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { 2359 replaceOperand(*II, 0, X); 2360 replaceOperand(*II, 1, Y); 2361 return II; 2362 } 2363 2364 // fma fabs(x), fabs(x), z -> fma x, x, z 2365 if (match(Src0, m_FAbs(m_Value(X))) && 2366 match(Src1, m_FAbs(m_Specific(X)))) { 2367 replaceOperand(*II, 0, X); 2368 replaceOperand(*II, 1, X); 2369 return II; 2370 } 2371 2372 // Try to simplify the underlying FMul. We can only apply simplifications 2373 // that do not require rounding. 2374 if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), 2375 II->getFastMathFlags(), 2376 SQ.getWithInstruction(II))) { 2377 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2378 FAdd->copyFastMathFlags(II); 2379 return FAdd; 2380 } 2381 2382 // fma x, y, 0 -> fmul x, y 2383 // This is always valid for -0.0, but requires nsz for +0.0 as 2384 // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own. 2385 if (match(II->getArgOperand(2), m_NegZeroFP()) || 2386 (match(II->getArgOperand(2), m_PosZeroFP()) && 2387 II->getFastMathFlags().noSignedZeros())) 2388 return BinaryOperator::CreateFMulFMF(Src0, Src1, II); 2389 2390 break; 2391 } 2392 case Intrinsic::copysign: { 2393 if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { 2394 // If we know that the sign argument is positive, reduce to FABS: 2395 // copysign X, Pos --> fabs X 2396 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2397 II->getArgOperand(0), II); 2398 return replaceInstUsesWith(*II, Fabs); 2399 } 2400 // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. 
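// In the meantime, a negative constant sign operand is handled directly:
// e.g. copysign %x, -2.5 --> fneg (fabs %x).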
2401 const APFloat *C; 2402 if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { 2403 // If we know that the sign argument is negative, reduce to FNABS: 2404 // copysign X, Neg --> fneg (fabs X) 2405 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2406 II->getArgOperand(0), II); 2407 return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); 2408 } 2409 2410 // Propagate sign argument through nested calls: 2411 // copysign X, (copysign ?, SignArg) --> copysign X, SignArg 2412 Value *SignArg; 2413 if (match(II->getArgOperand(1), 2414 m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) 2415 return replaceOperand(*II, 1, SignArg); 2416 2417 break; 2418 } 2419 case Intrinsic::fabs: { 2420 Value *Cond; 2421 Constant *LHS, *RHS; 2422 if (match(II->getArgOperand(0), 2423 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2424 CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); 2425 CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); 2426 return SelectInst::Create(Cond, Call0, Call1); 2427 } 2428 2429 LLVM_FALLTHROUGH; 2430 } 2431 case Intrinsic::ceil: 2432 case Intrinsic::floor: 2433 case Intrinsic::round: 2434 case Intrinsic::roundeven: 2435 case Intrinsic::nearbyint: 2436 case Intrinsic::rint: 2437 case Intrinsic::trunc: { 2438 Value *ExtSrc; 2439 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { 2440 // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) 2441 Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II); 2442 return new FPExtInst(NarrowII, II->getType()); 2443 } 2444 break; 2445 } 2446 case Intrinsic::cos: 2447 case Intrinsic::amdgcn_cos: { 2448 Value *X; 2449 Value *Src = II->getArgOperand(0); 2450 if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { 2451 // cos(-x) -> cos(x) 2452 // cos(fabs(x)) -> cos(x) 2453 return replaceOperand(*II, 0, X); 2454 } 2455 break; 2456 } 2457 case Intrinsic::sin: { 2458 Value *X; 2459 if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { 2460 // sin(-x) --> -sin(x) 2461 Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); 2462 Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin); 2463 FNeg->copyFastMathFlags(II); 2464 return FNeg; 2465 } 2466 break; 2467 } 2468 case Intrinsic::ppc_altivec_lvx: 2469 case Intrinsic::ppc_altivec_lvxl: 2470 // Turn PPC lvx -> load if the pointer is known aligned. 2471 if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC, 2472 &DT) >= 16) { 2473 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2474 PointerType::getUnqual(II->getType())); 2475 return new LoadInst(II->getType(), Ptr, "", false, Align(16)); 2476 } 2477 break; 2478 case Intrinsic::ppc_vsx_lxvw4x: 2479 case Intrinsic::ppc_vsx_lxvd2x: { 2480 // Turn PPC VSX loads into normal loads. 2481 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2482 PointerType::getUnqual(II->getType())); 2483 return new LoadInst(II->getType(), Ptr, Twine(""), false, Align(1)); 2484 } 2485 case Intrinsic::ppc_altivec_stvx: 2486 case Intrinsic::ppc_altivec_stvxl: 2487 // Turn stvx -> store if the pointer is known aligned. 
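// (stvx ignores the low bits of the address, so the replacement is only safe
// when the pointer is proven to be at least 16-byte aligned.)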
2488 if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC, 2489 &DT) >= 16) { 2490 Type *OpPtrTy = 2491 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2492 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2493 return new StoreInst(II->getArgOperand(0), Ptr, false, Align(16)); 2494 } 2495 break; 2496 case Intrinsic::ppc_vsx_stxvw4x: 2497 case Intrinsic::ppc_vsx_stxvd2x: { 2498 // Turn PPC VSX stores into normal stores. 2499 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2500 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2501 return new StoreInst(II->getArgOperand(0), Ptr, false, Align(1)); 2502 } 2503 case Intrinsic::ppc_qpx_qvlfs: 2504 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 2505 if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC, 2506 &DT) >= 16) { 2507 Type *VTy = 2508 VectorType::get(Builder.getFloatTy(), 2509 cast<VectorType>(II->getType())->getElementCount()); 2510 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2511 PointerType::getUnqual(VTy)); 2512 Value *Load = Builder.CreateLoad(VTy, Ptr); 2513 return new FPExtInst(Load, II->getType()); 2514 } 2515 break; 2516 case Intrinsic::ppc_qpx_qvlfd: 2517 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2518 if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(32), DL, II, &AC, 2519 &DT) >= 32) { 2520 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2521 PointerType::getUnqual(II->getType())); 2522 return new LoadInst(II->getType(), Ptr, "", false, Align(32)); 2523 } 2524 break; 2525 case Intrinsic::ppc_qpx_qvstfs: 2526 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 2527 if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC, 2528 &DT) >= 16) { 2529 Type *VTy = VectorType::get( 2530 Builder.getFloatTy(), 2531 cast<VectorType>(II->getArgOperand(0)->getType())->getElementCount()); 2532 Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); 2533 Type *OpPtrTy = PointerType::getUnqual(VTy); 2534 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2535 return new StoreInst(TOp, Ptr, false, Align(16)); 2536 } 2537 break; 2538 case Intrinsic::ppc_qpx_qvstfd: 2539 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 2540 if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(32), DL, II, &AC, 2541 &DT) >= 32) { 2542 Type *OpPtrTy = 2543 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2544 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2545 return new StoreInst(II->getArgOperand(0), Ptr, false, Align(32)); 2546 } 2547 break; 2548 2549 case Intrinsic::x86_bmi_bextr_32: 2550 case Intrinsic::x86_bmi_bextr_64: 2551 case Intrinsic::x86_tbm_bextri_u32: 2552 case Intrinsic::x86_tbm_bextri_u64: 2553 // If the RHS is a constant we can try some simplifications. 2554 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2555 uint64_t Shift = C->getZExtValue(); 2556 uint64_t Length = (Shift >> 8) & 0xff; 2557 Shift &= 0xff; 2558 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2559 // If the length is 0 or the shift is out of range, replace with zero. 2560 if (Length == 0 || Shift >= BitWidth) 2561 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2562 // If the LHS is also a constant, we can completely constant fold this. 
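// e.g. (illustrative) bextr(0x12345678, 0x0804) extracts 8 bits starting at
// bit 4: (0x12345678 >> 4) & 0xff = 0x67.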
2563 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2564 uint64_t Result = InC->getZExtValue() >> Shift; 2565 if (Length > BitWidth) 2566 Length = BitWidth; 2567 Result &= maskTrailingOnes<uint64_t>(Length); 2568 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2569 } 2570 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2571 // are only masking bits that a shift already cleared? 2572 } 2573 break; 2574 2575 case Intrinsic::x86_bmi_bzhi_32: 2576 case Intrinsic::x86_bmi_bzhi_64: 2577 // If the RHS is a constant we can try some simplifications. 2578 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2579 uint64_t Index = C->getZExtValue() & 0xff; 2580 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2581 if (Index >= BitWidth) 2582 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2583 if (Index == 0) 2584 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2585 // If the LHS is also a constant, we can completely constant fold this. 2586 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2587 uint64_t Result = InC->getZExtValue(); 2588 Result &= maskTrailingOnes<uint64_t>(Index); 2589 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2590 } 2591 // TODO should we convert this to an AND if the RHS is constant? 2592 } 2593 break; 2594 case Intrinsic::x86_bmi_pext_32: 2595 case Intrinsic::x86_bmi_pext_64: 2596 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2597 if (MaskC->isNullValue()) 2598 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2599 if (MaskC->isAllOnesValue()) 2600 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2601 2602 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2603 uint64_t Src = SrcC->getZExtValue(); 2604 uint64_t Mask = MaskC->getZExtValue(); 2605 uint64_t Result = 0; 2606 uint64_t BitToSet = 1; 2607 2608 while (Mask) { 2609 // Isolate lowest set bit. 2610 uint64_t BitToTest = Mask & -Mask; 2611 if (BitToTest & Src) 2612 Result |= BitToSet; 2613 2614 BitToSet <<= 1; 2615 // Clear lowest set bit. 2616 Mask &= Mask - 1; 2617 } 2618 2619 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2620 } 2621 } 2622 break; 2623 case Intrinsic::x86_bmi_pdep_32: 2624 case Intrinsic::x86_bmi_pdep_64: 2625 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2626 if (MaskC->isNullValue()) 2627 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2628 if (MaskC->isAllOnesValue()) 2629 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2630 2631 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2632 uint64_t Src = SrcC->getZExtValue(); 2633 uint64_t Mask = MaskC->getZExtValue(); 2634 uint64_t Result = 0; 2635 uint64_t BitToTest = 1; 2636 2637 while (Mask) { 2638 // Isolate lowest set bit. 
2639 uint64_t BitToSet = Mask & -Mask; 2640 if (BitToTest & Src) 2641 Result |= BitToSet; 2642 2643 BitToTest <<= 1; 2644 // Clear lowest set bit; 2645 Mask &= Mask - 1; 2646 } 2647 2648 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2649 } 2650 } 2651 break; 2652 2653 case Intrinsic::x86_sse_cvtss2si: 2654 case Intrinsic::x86_sse_cvtss2si64: 2655 case Intrinsic::x86_sse_cvttss2si: 2656 case Intrinsic::x86_sse_cvttss2si64: 2657 case Intrinsic::x86_sse2_cvtsd2si: 2658 case Intrinsic::x86_sse2_cvtsd2si64: 2659 case Intrinsic::x86_sse2_cvttsd2si: 2660 case Intrinsic::x86_sse2_cvttsd2si64: 2661 case Intrinsic::x86_avx512_vcvtss2si32: 2662 case Intrinsic::x86_avx512_vcvtss2si64: 2663 case Intrinsic::x86_avx512_vcvtss2usi32: 2664 case Intrinsic::x86_avx512_vcvtss2usi64: 2665 case Intrinsic::x86_avx512_vcvtsd2si32: 2666 case Intrinsic::x86_avx512_vcvtsd2si64: 2667 case Intrinsic::x86_avx512_vcvtsd2usi32: 2668 case Intrinsic::x86_avx512_vcvtsd2usi64: 2669 case Intrinsic::x86_avx512_cvttss2si: 2670 case Intrinsic::x86_avx512_cvttss2si64: 2671 case Intrinsic::x86_avx512_cvttss2usi: 2672 case Intrinsic::x86_avx512_cvttss2usi64: 2673 case Intrinsic::x86_avx512_cvttsd2si: 2674 case Intrinsic::x86_avx512_cvttsd2si64: 2675 case Intrinsic::x86_avx512_cvttsd2usi: 2676 case Intrinsic::x86_avx512_cvttsd2usi64: { 2677 // These intrinsics only demand the 0th element of their input vectors. If 2678 // we can simplify the input based on that, do so now. 2679 Value *Arg = II->getArgOperand(0); 2680 unsigned VWidth = cast<VectorType>(Arg->getType())->getNumElements(); 2681 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) 2682 return replaceOperand(*II, 0, V); 2683 break; 2684 } 2685 2686 case Intrinsic::x86_mmx_pmovmskb: 2687 case Intrinsic::x86_sse_movmsk_ps: 2688 case Intrinsic::x86_sse2_movmsk_pd: 2689 case Intrinsic::x86_sse2_pmovmskb_128: 2690 case Intrinsic::x86_avx_movmsk_pd_256: 2691 case Intrinsic::x86_avx_movmsk_ps_256: 2692 case Intrinsic::x86_avx2_pmovmskb: 2693 if (Value *V = simplifyX86movmsk(*II, Builder)) 2694 return replaceInstUsesWith(*II, V); 2695 break; 2696 2697 case Intrinsic::x86_sse_comieq_ss: 2698 case Intrinsic::x86_sse_comige_ss: 2699 case Intrinsic::x86_sse_comigt_ss: 2700 case Intrinsic::x86_sse_comile_ss: 2701 case Intrinsic::x86_sse_comilt_ss: 2702 case Intrinsic::x86_sse_comineq_ss: 2703 case Intrinsic::x86_sse_ucomieq_ss: 2704 case Intrinsic::x86_sse_ucomige_ss: 2705 case Intrinsic::x86_sse_ucomigt_ss: 2706 case Intrinsic::x86_sse_ucomile_ss: 2707 case Intrinsic::x86_sse_ucomilt_ss: 2708 case Intrinsic::x86_sse_ucomineq_ss: 2709 case Intrinsic::x86_sse2_comieq_sd: 2710 case Intrinsic::x86_sse2_comige_sd: 2711 case Intrinsic::x86_sse2_comigt_sd: 2712 case Intrinsic::x86_sse2_comile_sd: 2713 case Intrinsic::x86_sse2_comilt_sd: 2714 case Intrinsic::x86_sse2_comineq_sd: 2715 case Intrinsic::x86_sse2_ucomieq_sd: 2716 case Intrinsic::x86_sse2_ucomige_sd: 2717 case Intrinsic::x86_sse2_ucomigt_sd: 2718 case Intrinsic::x86_sse2_ucomile_sd: 2719 case Intrinsic::x86_sse2_ucomilt_sd: 2720 case Intrinsic::x86_sse2_ucomineq_sd: 2721 case Intrinsic::x86_avx512_vcomi_ss: 2722 case Intrinsic::x86_avx512_vcomi_sd: 2723 case Intrinsic::x86_avx512_mask_cmp_ss: 2724 case Intrinsic::x86_avx512_mask_cmp_sd: { 2725 // These intrinsics only demand the 0th element of their input vectors. If 2726 // we can simplify the input based on that, do so now. 
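// e.g. insertelement chains that only change the upper lanes of either
// operand can be stripped, since only lane 0 reaches the scalar comparison.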
2727 bool MadeChange = false; 2728 Value *Arg0 = II->getArgOperand(0); 2729 Value *Arg1 = II->getArgOperand(1); 2730 unsigned VWidth = cast<VectorType>(Arg0->getType())->getNumElements(); 2731 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2732 replaceOperand(*II, 0, V); 2733 MadeChange = true; 2734 } 2735 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2736 replaceOperand(*II, 1, V); 2737 MadeChange = true; 2738 } 2739 if (MadeChange) 2740 return II; 2741 break; 2742 } 2743 case Intrinsic::x86_avx512_cmp_pd_128: 2744 case Intrinsic::x86_avx512_cmp_pd_256: 2745 case Intrinsic::x86_avx512_cmp_pd_512: 2746 case Intrinsic::x86_avx512_cmp_ps_128: 2747 case Intrinsic::x86_avx512_cmp_ps_256: 2748 case Intrinsic::x86_avx512_cmp_ps_512: { 2749 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) 2750 Value *Arg0 = II->getArgOperand(0); 2751 Value *Arg1 = II->getArgOperand(1); 2752 bool Arg0IsZero = match(Arg0, m_PosZeroFP()); 2753 if (Arg0IsZero) 2754 std::swap(Arg0, Arg1); 2755 Value *A, *B; 2756 // This fold requires only the NINF(not +/- inf) since inf minus 2757 // inf is nan. 2758 // NSZ(No Signed Zeros) is not needed because zeros of any sign are 2759 // equal for both compares. 2760 // NNAN is not needed because nans compare the same for both compares. 2761 // The compare intrinsic uses the above assumptions and therefore 2762 // doesn't require additional flags. 2763 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && 2764 match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) && 2765 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { 2766 if (Arg0IsZero) 2767 std::swap(A, B); 2768 replaceOperand(*II, 0, A); 2769 replaceOperand(*II, 1, B); 2770 return II; 2771 } 2772 break; 2773 } 2774 2775 case Intrinsic::x86_avx512_add_ps_512: 2776 case Intrinsic::x86_avx512_div_ps_512: 2777 case Intrinsic::x86_avx512_mul_ps_512: 2778 case Intrinsic::x86_avx512_sub_ps_512: 2779 case Intrinsic::x86_avx512_add_pd_512: 2780 case Intrinsic::x86_avx512_div_pd_512: 2781 case Intrinsic::x86_avx512_mul_pd_512: 2782 case Intrinsic::x86_avx512_sub_pd_512: 2783 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2784 // IR operations. 
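// e.g. (illustrative) x86_avx512_add_ps_512(%a, %b, i32 4) becomes
// 'fadd <16 x float> %a, %b'.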
2785 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2786 if (R->getValue() == 4) { 2787 Value *Arg0 = II->getArgOperand(0); 2788 Value *Arg1 = II->getArgOperand(1); 2789 2790 Value *V; 2791 switch (IID) { 2792 default: llvm_unreachable("Case stmts out of sync!"); 2793 case Intrinsic::x86_avx512_add_ps_512: 2794 case Intrinsic::x86_avx512_add_pd_512: 2795 V = Builder.CreateFAdd(Arg0, Arg1); 2796 break; 2797 case Intrinsic::x86_avx512_sub_ps_512: 2798 case Intrinsic::x86_avx512_sub_pd_512: 2799 V = Builder.CreateFSub(Arg0, Arg1); 2800 break; 2801 case Intrinsic::x86_avx512_mul_ps_512: 2802 case Intrinsic::x86_avx512_mul_pd_512: 2803 V = Builder.CreateFMul(Arg0, Arg1); 2804 break; 2805 case Intrinsic::x86_avx512_div_ps_512: 2806 case Intrinsic::x86_avx512_div_pd_512: 2807 V = Builder.CreateFDiv(Arg0, Arg1); 2808 break; 2809 } 2810 2811 return replaceInstUsesWith(*II, V); 2812 } 2813 } 2814 break; 2815 2816 case Intrinsic::x86_avx512_mask_add_ss_round: 2817 case Intrinsic::x86_avx512_mask_div_ss_round: 2818 case Intrinsic::x86_avx512_mask_mul_ss_round: 2819 case Intrinsic::x86_avx512_mask_sub_ss_round: 2820 case Intrinsic::x86_avx512_mask_add_sd_round: 2821 case Intrinsic::x86_avx512_mask_div_sd_round: 2822 case Intrinsic::x86_avx512_mask_mul_sd_round: 2823 case Intrinsic::x86_avx512_mask_sub_sd_round: 2824 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2825 // IR operations. 2826 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2827 if (R->getValue() == 4) { 2828 // Extract the element as scalars. 2829 Value *Arg0 = II->getArgOperand(0); 2830 Value *Arg1 = II->getArgOperand(1); 2831 Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); 2832 Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); 2833 2834 Value *V; 2835 switch (IID) { 2836 default: llvm_unreachable("Case stmts out of sync!"); 2837 case Intrinsic::x86_avx512_mask_add_ss_round: 2838 case Intrinsic::x86_avx512_mask_add_sd_round: 2839 V = Builder.CreateFAdd(LHS, RHS); 2840 break; 2841 case Intrinsic::x86_avx512_mask_sub_ss_round: 2842 case Intrinsic::x86_avx512_mask_sub_sd_round: 2843 V = Builder.CreateFSub(LHS, RHS); 2844 break; 2845 case Intrinsic::x86_avx512_mask_mul_ss_round: 2846 case Intrinsic::x86_avx512_mask_mul_sd_round: 2847 V = Builder.CreateFMul(LHS, RHS); 2848 break; 2849 case Intrinsic::x86_avx512_mask_div_ss_round: 2850 case Intrinsic::x86_avx512_mask_div_sd_round: 2851 V = Builder.CreateFDiv(LHS, RHS); 2852 break; 2853 } 2854 2855 // Handle the masking aspect of the intrinsic. 2856 Value *Mask = II->getArgOperand(3); 2857 auto *C = dyn_cast<ConstantInt>(Mask); 2858 // We don't need a select if we know the mask bit is a 1. 2859 if (!C || !C->getValue()[0]) { 2860 // Cast the mask to an i1 vector and then extract the lowest element. 2861 auto *MaskTy = FixedVectorType::get( 2862 Builder.getInt1Ty(), 2863 cast<IntegerType>(Mask->getType())->getBitWidth()); 2864 Mask = Builder.CreateBitCast(Mask, MaskTy); 2865 Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); 2866 // Extract the lowest element from the passthru operand. 2867 Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), 2868 (uint64_t)0); 2869 V = Builder.CreateSelect(Mask, V, Passthru); 2870 } 2871 2872 // Insert the result back into the original argument 0. 2873 V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2874 2875 return replaceInstUsesWith(*II, V); 2876 } 2877 } 2878 break; 2879 2880 // Constant fold ashr( <A x Bi>, Ci ). 
2881 // Constant fold lshr( <A x Bi>, Ci ). 2882 // Constant fold shl( <A x Bi>, Ci ). 2883 case Intrinsic::x86_sse2_psrai_d: 2884 case Intrinsic::x86_sse2_psrai_w: 2885 case Intrinsic::x86_avx2_psrai_d: 2886 case Intrinsic::x86_avx2_psrai_w: 2887 case Intrinsic::x86_avx512_psrai_q_128: 2888 case Intrinsic::x86_avx512_psrai_q_256: 2889 case Intrinsic::x86_avx512_psrai_d_512: 2890 case Intrinsic::x86_avx512_psrai_q_512: 2891 case Intrinsic::x86_avx512_psrai_w_512: 2892 case Intrinsic::x86_sse2_psrli_d: 2893 case Intrinsic::x86_sse2_psrli_q: 2894 case Intrinsic::x86_sse2_psrli_w: 2895 case Intrinsic::x86_avx2_psrli_d: 2896 case Intrinsic::x86_avx2_psrli_q: 2897 case Intrinsic::x86_avx2_psrli_w: 2898 case Intrinsic::x86_avx512_psrli_d_512: 2899 case Intrinsic::x86_avx512_psrli_q_512: 2900 case Intrinsic::x86_avx512_psrli_w_512: 2901 case Intrinsic::x86_sse2_pslli_d: 2902 case Intrinsic::x86_sse2_pslli_q: 2903 case Intrinsic::x86_sse2_pslli_w: 2904 case Intrinsic::x86_avx2_pslli_d: 2905 case Intrinsic::x86_avx2_pslli_q: 2906 case Intrinsic::x86_avx2_pslli_w: 2907 case Intrinsic::x86_avx512_pslli_d_512: 2908 case Intrinsic::x86_avx512_pslli_q_512: 2909 case Intrinsic::x86_avx512_pslli_w_512: 2910 if (Value *V = simplifyX86immShift(*II, Builder)) 2911 return replaceInstUsesWith(*II, V); 2912 break; 2913 2914 case Intrinsic::x86_sse2_psra_d: 2915 case Intrinsic::x86_sse2_psra_w: 2916 case Intrinsic::x86_avx2_psra_d: 2917 case Intrinsic::x86_avx2_psra_w: 2918 case Intrinsic::x86_avx512_psra_q_128: 2919 case Intrinsic::x86_avx512_psra_q_256: 2920 case Intrinsic::x86_avx512_psra_d_512: 2921 case Intrinsic::x86_avx512_psra_q_512: 2922 case Intrinsic::x86_avx512_psra_w_512: 2923 case Intrinsic::x86_sse2_psrl_d: 2924 case Intrinsic::x86_sse2_psrl_q: 2925 case Intrinsic::x86_sse2_psrl_w: 2926 case Intrinsic::x86_avx2_psrl_d: 2927 case Intrinsic::x86_avx2_psrl_q: 2928 case Intrinsic::x86_avx2_psrl_w: 2929 case Intrinsic::x86_avx512_psrl_d_512: 2930 case Intrinsic::x86_avx512_psrl_q_512: 2931 case Intrinsic::x86_avx512_psrl_w_512: 2932 case Intrinsic::x86_sse2_psll_d: 2933 case Intrinsic::x86_sse2_psll_q: 2934 case Intrinsic::x86_sse2_psll_w: 2935 case Intrinsic::x86_avx2_psll_d: 2936 case Intrinsic::x86_avx2_psll_q: 2937 case Intrinsic::x86_avx2_psll_w: 2938 case Intrinsic::x86_avx512_psll_d_512: 2939 case Intrinsic::x86_avx512_psll_q_512: 2940 case Intrinsic::x86_avx512_psll_w_512: { 2941 if (Value *V = simplifyX86immShift(*II, Builder)) 2942 return replaceInstUsesWith(*II, V); 2943 2944 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2945 // operand to compute the shift amount. 
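// e.g. for psll.d the count operand is a <4 x i32>, but only elements 0 and 1
// (the low 64 bits) are read, so the upper elements can be simplified away.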
2946 Value *Arg1 = II->getArgOperand(1); 2947 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2948 "Unexpected packed shift size"); 2949 unsigned VWidth = cast<VectorType>(Arg1->getType())->getNumElements(); 2950 2951 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) 2952 return replaceOperand(*II, 1, V); 2953 break; 2954 } 2955 2956 case Intrinsic::x86_avx2_psllv_d: 2957 case Intrinsic::x86_avx2_psllv_d_256: 2958 case Intrinsic::x86_avx2_psllv_q: 2959 case Intrinsic::x86_avx2_psllv_q_256: 2960 case Intrinsic::x86_avx512_psllv_d_512: 2961 case Intrinsic::x86_avx512_psllv_q_512: 2962 case Intrinsic::x86_avx512_psllv_w_128: 2963 case Intrinsic::x86_avx512_psllv_w_256: 2964 case Intrinsic::x86_avx512_psllv_w_512: 2965 case Intrinsic::x86_avx2_psrav_d: 2966 case Intrinsic::x86_avx2_psrav_d_256: 2967 case Intrinsic::x86_avx512_psrav_q_128: 2968 case Intrinsic::x86_avx512_psrav_q_256: 2969 case Intrinsic::x86_avx512_psrav_d_512: 2970 case Intrinsic::x86_avx512_psrav_q_512: 2971 case Intrinsic::x86_avx512_psrav_w_128: 2972 case Intrinsic::x86_avx512_psrav_w_256: 2973 case Intrinsic::x86_avx512_psrav_w_512: 2974 case Intrinsic::x86_avx2_psrlv_d: 2975 case Intrinsic::x86_avx2_psrlv_d_256: 2976 case Intrinsic::x86_avx2_psrlv_q: 2977 case Intrinsic::x86_avx2_psrlv_q_256: 2978 case Intrinsic::x86_avx512_psrlv_d_512: 2979 case Intrinsic::x86_avx512_psrlv_q_512: 2980 case Intrinsic::x86_avx512_psrlv_w_128: 2981 case Intrinsic::x86_avx512_psrlv_w_256: 2982 case Intrinsic::x86_avx512_psrlv_w_512: 2983 if (Value *V = simplifyX86varShift(*II, Builder)) 2984 return replaceInstUsesWith(*II, V); 2985 break; 2986 2987 case Intrinsic::x86_sse2_packssdw_128: 2988 case Intrinsic::x86_sse2_packsswb_128: 2989 case Intrinsic::x86_avx2_packssdw: 2990 case Intrinsic::x86_avx2_packsswb: 2991 case Intrinsic::x86_avx512_packssdw_512: 2992 case Intrinsic::x86_avx512_packsswb_512: 2993 if (Value *V = simplifyX86pack(*II, Builder, true)) 2994 return replaceInstUsesWith(*II, V); 2995 break; 2996 2997 case Intrinsic::x86_sse2_packuswb_128: 2998 case Intrinsic::x86_sse41_packusdw: 2999 case Intrinsic::x86_avx2_packusdw: 3000 case Intrinsic::x86_avx2_packuswb: 3001 case Intrinsic::x86_avx512_packusdw_512: 3002 case Intrinsic::x86_avx512_packuswb_512: 3003 if (Value *V = simplifyX86pack(*II, Builder, false)) 3004 return replaceInstUsesWith(*II, V); 3005 break; 3006 3007 case Intrinsic::x86_pclmulqdq: 3008 case Intrinsic::x86_pclmulqdq_256: 3009 case Intrinsic::x86_pclmulqdq_512: { 3010 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 3011 unsigned Imm = C->getZExtValue(); 3012 3013 bool MadeChange = false; 3014 Value *Arg0 = II->getArgOperand(0); 3015 Value *Arg1 = II->getArgOperand(1); 3016 unsigned VWidth = cast<VectorType>(Arg0->getType())->getNumElements(); 3017 3018 APInt UndefElts1(VWidth, 0); 3019 APInt DemandedElts1 = APInt::getSplat(VWidth, 3020 APInt(2, (Imm & 0x01) ? 2 : 1)); 3021 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, 3022 UndefElts1)) { 3023 replaceOperand(*II, 0, V); 3024 MadeChange = true; 3025 } 3026 3027 APInt UndefElts2(VWidth, 0); 3028 APInt DemandedElts2 = APInt::getSplat(VWidth, 3029 APInt(2, (Imm & 0x10) ? 2 : 1)); 3030 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, 3031 UndefElts2)) { 3032 replaceOperand(*II, 1, V); 3033 MadeChange = true; 3034 } 3035 3036 // If either input elements are undef, the result is zero. 
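// e.g. if the immediate selects the high 64-bit half of an operand and that
// half is undef, the whole carry-less product folds to zero.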
3037 if (DemandedElts1.isSubsetOf(UndefElts1) || 3038 DemandedElts2.isSubsetOf(UndefElts2)) 3039 return replaceInstUsesWith(*II, 3040 ConstantAggregateZero::get(II->getType())); 3041 3042 if (MadeChange) 3043 return II; 3044 } 3045 break; 3046 } 3047 3048 case Intrinsic::x86_sse41_insertps: 3049 if (Value *V = simplifyX86insertps(*II, Builder)) 3050 return replaceInstUsesWith(*II, V); 3051 break; 3052 3053 case Intrinsic::x86_sse4a_extrq: { 3054 Value *Op0 = II->getArgOperand(0); 3055 Value *Op1 = II->getArgOperand(1); 3056 unsigned VWidth0 = cast<VectorType>(Op0->getType())->getNumElements(); 3057 unsigned VWidth1 = cast<VectorType>(Op1->getType())->getNumElements(); 3058 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3059 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3060 VWidth1 == 16 && "Unexpected operand sizes"); 3061 3062 // See if we're dealing with constant values. 3063 Constant *C1 = dyn_cast<Constant>(Op1); 3064 ConstantInt *CILength = 3065 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 3066 : nullptr; 3067 ConstantInt *CIIndex = 3068 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3069 : nullptr; 3070 3071 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 3072 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3073 return replaceInstUsesWith(*II, V); 3074 3075 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 3076 // operands and the lowest 16-bits of the second. 3077 bool MadeChange = false; 3078 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3079 replaceOperand(*II, 0, V); 3080 MadeChange = true; 3081 } 3082 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 3083 replaceOperand(*II, 1, V); 3084 MadeChange = true; 3085 } 3086 if (MadeChange) 3087 return II; 3088 break; 3089 } 3090 3091 case Intrinsic::x86_sse4a_extrqi: { 3092 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 3093 // bits of the lower 64-bits. The upper 64-bits are undefined. 3094 Value *Op0 = II->getArgOperand(0); 3095 unsigned VWidth = cast<VectorType>(Op0->getType())->getNumElements(); 3096 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3097 "Unexpected operand size"); 3098 3099 // See if we're dealing with constant values. 3100 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3101 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3102 3103 // Attempt to simplify to a constant or shuffle vector. 3104 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3105 return replaceInstUsesWith(*II, V); 3106 3107 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 3108 // operand. 3109 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) 3110 return replaceOperand(*II, 0, V); 3111 break; 3112 } 3113 3114 case Intrinsic::x86_sse4a_insertq: { 3115 Value *Op0 = II->getArgOperand(0); 3116 Value *Op1 = II->getArgOperand(1); 3117 unsigned VWidth = cast<VectorType>(Op0->getType())->getNumElements(); 3118 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3119 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3120 cast<VectorType>(Op1->getType())->getNumElements() == 2 && 3121 "Unexpected operand size"); 3122 3123 // See if we're dealing with constant values. 3124 Constant *C1 = dyn_cast<Constant>(Op1); 3125 ConstantInt *CI11 = 3126 C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3127 : nullptr; 3128 3129 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 3130 if (CI11) { 3131 const APInt &V11 = CI11->getValue(); 3132 APInt Len = V11.zextOrTrunc(6); 3133 APInt Idx = V11.lshr(8).zextOrTrunc(6); 3134 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3135 return replaceInstUsesWith(*II, V); 3136 } 3137 3138 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 3139 // operand. 3140 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) 3141 return replaceOperand(*II, 0, V); 3142 break; 3143 } 3144 3145 case Intrinsic::x86_sse4a_insertqi: { 3146 // INSERTQI: Extract lowest Length bits from lower half of second source and 3147 // insert over first source starting at Index bit. The upper 64-bits are 3148 // undefined. 3149 Value *Op0 = II->getArgOperand(0); 3150 Value *Op1 = II->getArgOperand(1); 3151 unsigned VWidth0 = cast<VectorType>(Op0->getType())->getNumElements(); 3152 unsigned VWidth1 = cast<VectorType>(Op1->getType())->getNumElements(); 3153 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3154 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3155 VWidth1 == 2 && "Unexpected operand sizes"); 3156 3157 // See if we're dealing with constant values. 3158 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3159 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 3160 3161 // Attempt to simplify to a constant or shuffle vector. 3162 if (CILength && CIIndex) { 3163 APInt Len = CILength->getValue().zextOrTrunc(6); 3164 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 3165 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3166 return replaceInstUsesWith(*II, V); 3167 } 3168 3169 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 3170 // operands. 3171 bool MadeChange = false; 3172 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3173 replaceOperand(*II, 0, V); 3174 MadeChange = true; 3175 } 3176 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 3177 replaceOperand(*II, 1, V); 3178 MadeChange = true; 3179 } 3180 if (MadeChange) 3181 return II; 3182 break; 3183 } 3184 3185 case Intrinsic::x86_sse41_pblendvb: 3186 case Intrinsic::x86_sse41_blendvps: 3187 case Intrinsic::x86_sse41_blendvpd: 3188 case Intrinsic::x86_avx_blendv_ps_256: 3189 case Intrinsic::x86_avx_blendv_pd_256: 3190 case Intrinsic::x86_avx2_pblendvb: { 3191 // fold (blend A, A, Mask) -> A 3192 Value *Op0 = II->getArgOperand(0); 3193 Value *Op1 = II->getArgOperand(1); 3194 Value *Mask = II->getArgOperand(2); 3195 if (Op0 == Op1) 3196 return replaceInstUsesWith(CI, Op0); 3197 3198 // Zero Mask - select 1st argument. 3199 if (isa<ConstantAggregateZero>(Mask)) 3200 return replaceInstUsesWith(CI, Op0); 3201 3202 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 3203 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 3204 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 3205 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 3206 } 3207 3208 // Convert to a vector select if we can bypass casts and find a boolean 3209 // vector condition value. 
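    // A mask built by sign-extending an <N x i1> vector has each lane either
    // all-ones or all-zeros, so the blend is exactly an IR select on that
    // boolean vector.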
3210 Value *BoolVec; 3211 Mask = peekThroughBitcast(Mask); 3212 if (match(Mask, m_SExt(m_Value(BoolVec))) && 3213 BoolVec->getType()->isVectorTy() && 3214 BoolVec->getType()->getScalarSizeInBits() == 1) { 3215 assert(Mask->getType()->getPrimitiveSizeInBits() == 3216 II->getType()->getPrimitiveSizeInBits() && 3217 "Not expecting mask and operands with different sizes"); 3218 3219 unsigned NumMaskElts = 3220 cast<VectorType>(Mask->getType())->getNumElements(); 3221 unsigned NumOperandElts = 3222 cast<VectorType>(II->getType())->getNumElements(); 3223 if (NumMaskElts == NumOperandElts) 3224 return SelectInst::Create(BoolVec, Op1, Op0); 3225 3226 // If the mask has less elements than the operands, each mask bit maps to 3227 // multiple elements of the operands. Bitcast back and forth. 3228 if (NumMaskElts < NumOperandElts) { 3229 Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); 3230 Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); 3231 Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 3232 return new BitCastInst(Sel, II->getType()); 3233 } 3234 } 3235 3236 break; 3237 } 3238 3239 case Intrinsic::x86_ssse3_pshuf_b_128: 3240 case Intrinsic::x86_avx2_pshuf_b: 3241 case Intrinsic::x86_avx512_pshuf_b_512: 3242 if (Value *V = simplifyX86pshufb(*II, Builder)) 3243 return replaceInstUsesWith(*II, V); 3244 break; 3245 3246 case Intrinsic::x86_avx_vpermilvar_ps: 3247 case Intrinsic::x86_avx_vpermilvar_ps_256: 3248 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3249 case Intrinsic::x86_avx_vpermilvar_pd: 3250 case Intrinsic::x86_avx_vpermilvar_pd_256: 3251 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3252 if (Value *V = simplifyX86vpermilvar(*II, Builder)) 3253 return replaceInstUsesWith(*II, V); 3254 break; 3255 3256 case Intrinsic::x86_avx2_permd: 3257 case Intrinsic::x86_avx2_permps: 3258 case Intrinsic::x86_avx512_permvar_df_256: 3259 case Intrinsic::x86_avx512_permvar_df_512: 3260 case Intrinsic::x86_avx512_permvar_di_256: 3261 case Intrinsic::x86_avx512_permvar_di_512: 3262 case Intrinsic::x86_avx512_permvar_hi_128: 3263 case Intrinsic::x86_avx512_permvar_hi_256: 3264 case Intrinsic::x86_avx512_permvar_hi_512: 3265 case Intrinsic::x86_avx512_permvar_qi_128: 3266 case Intrinsic::x86_avx512_permvar_qi_256: 3267 case Intrinsic::x86_avx512_permvar_qi_512: 3268 case Intrinsic::x86_avx512_permvar_sf_512: 3269 case Intrinsic::x86_avx512_permvar_si_512: 3270 if (Value *V = simplifyX86vpermv(*II, Builder)) 3271 return replaceInstUsesWith(*II, V); 3272 break; 3273 3274 case Intrinsic::x86_avx_maskload_ps: 3275 case Intrinsic::x86_avx_maskload_pd: 3276 case Intrinsic::x86_avx_maskload_ps_256: 3277 case Intrinsic::x86_avx_maskload_pd_256: 3278 case Intrinsic::x86_avx2_maskload_d: 3279 case Intrinsic::x86_avx2_maskload_q: 3280 case Intrinsic::x86_avx2_maskload_d_256: 3281 case Intrinsic::x86_avx2_maskload_q_256: 3282 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 3283 return I; 3284 break; 3285 3286 case Intrinsic::x86_sse2_maskmov_dqu: 3287 case Intrinsic::x86_avx_maskstore_ps: 3288 case Intrinsic::x86_avx_maskstore_pd: 3289 case Intrinsic::x86_avx_maskstore_ps_256: 3290 case Intrinsic::x86_avx_maskstore_pd_256: 3291 case Intrinsic::x86_avx2_maskstore_d: 3292 case Intrinsic::x86_avx2_maskstore_q: 3293 case Intrinsic::x86_avx2_maskstore_d_256: 3294 case Intrinsic::x86_avx2_maskstore_q_256: 3295 if (simplifyX86MaskedStore(*II, *this)) 3296 return nullptr; 3297 break; 3298 3299 case Intrinsic::x86_addcarry_32: 3300 case Intrinsic::x86_addcarry_64: 3301 if (Value *V = 
simplifyX86addcarry(*II, Builder)) 3302 return replaceInstUsesWith(*II, V); 3303 break; 3304 3305 case Intrinsic::ppc_altivec_vperm: 3306 // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. 3307 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3308 // a vectorshuffle for little endian, we must undo the transformation 3309 // performed on vec_perm in altivec.h. That is, we must complement 3310 // the permutation mask with respect to 31 and reverse the order of 3311 // V1 and V2. 3312 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3313 assert(cast<VectorType>(Mask->getType())->getNumElements() == 16 && 3314 "Bad type for intrinsic!"); 3315 3316 // Check that all of the elements are integer constants or undefs. 3317 bool AllEltsOk = true; 3318 for (unsigned i = 0; i != 16; ++i) { 3319 Constant *Elt = Mask->getAggregateElement(i); 3320 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3321 AllEltsOk = false; 3322 break; 3323 } 3324 } 3325 3326 if (AllEltsOk) { 3327 // Cast the input vectors to byte vectors. 3328 Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), 3329 Mask->getType()); 3330 Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), 3331 Mask->getType()); 3332 Value *Result = UndefValue::get(Op0->getType()); 3333 3334 // Only extract each element once. 3335 Value *ExtractedElts[32]; 3336 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3337 3338 for (unsigned i = 0; i != 16; ++i) { 3339 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3340 continue; 3341 unsigned Idx = 3342 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3343 Idx &= 31; // Match the hardware behavior. 3344 if (DL.isLittleEndian()) 3345 Idx = 31 - Idx; 3346 3347 if (!ExtractedElts[Idx]) { 3348 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3349 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3350 ExtractedElts[Idx] = 3351 Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3352 Builder.getInt32(Idx&15)); 3353 } 3354 3355 // Insert this value into the result vector. 
3356 Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], 3357 Builder.getInt32(i)); 3358 } 3359 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3360 } 3361 } 3362 break; 3363 3364 case Intrinsic::arm_neon_vld1: { 3365 Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3366 if (Value *V = simplifyNeonVld1(*II, MemAlign.value(), Builder)) 3367 return replaceInstUsesWith(*II, V); 3368 break; 3369 } 3370 3371 case Intrinsic::arm_neon_vld2: 3372 case Intrinsic::arm_neon_vld3: 3373 case Intrinsic::arm_neon_vld4: 3374 case Intrinsic::arm_neon_vld2lane: 3375 case Intrinsic::arm_neon_vld3lane: 3376 case Intrinsic::arm_neon_vld4lane: 3377 case Intrinsic::arm_neon_vst1: 3378 case Intrinsic::arm_neon_vst2: 3379 case Intrinsic::arm_neon_vst3: 3380 case Intrinsic::arm_neon_vst4: 3381 case Intrinsic::arm_neon_vst2lane: 3382 case Intrinsic::arm_neon_vst3lane: 3383 case Intrinsic::arm_neon_vst4lane: { 3384 Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3385 unsigned AlignArg = II->getNumArgOperands() - 1; 3386 Value *AlignArgOp = II->getArgOperand(AlignArg); 3387 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); 3388 if (Align && *Align < MemAlign) 3389 return replaceOperand(*II, AlignArg, 3390 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3391 MemAlign.value(), false)); 3392 break; 3393 } 3394 3395 case Intrinsic::arm_neon_vtbl1: 3396 case Intrinsic::aarch64_neon_tbl1: 3397 if (Value *V = simplifyNeonTbl1(*II, Builder)) 3398 return replaceInstUsesWith(*II, V); 3399 break; 3400 3401 case Intrinsic::arm_neon_vmulls: 3402 case Intrinsic::arm_neon_vmullu: 3403 case Intrinsic::aarch64_neon_smull: 3404 case Intrinsic::aarch64_neon_umull: { 3405 Value *Arg0 = II->getArgOperand(0); 3406 Value *Arg1 = II->getArgOperand(1); 3407 3408 // Handle mul by zero first: 3409 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3410 return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); 3411 } 3412 3413 // Check for constant LHS & RHS - in this case we just simplify. 3414 bool Zext = (IID == Intrinsic::arm_neon_vmullu || 3415 IID == Intrinsic::aarch64_neon_umull); 3416 VectorType *NewVT = cast<VectorType>(II->getType()); 3417 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3418 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3419 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3420 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3421 3422 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3423 } 3424 3425 // Couldn't simplify - canonicalize constant to the RHS. 
3426 std::swap(Arg0, Arg1); 3427 } 3428 3429 // Handle mul by one: 3430 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3431 if (ConstantInt *Splat = 3432 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3433 if (Splat->isOne()) 3434 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3435 /*isSigned=*/!Zext); 3436 3437 break; 3438 } 3439 case Intrinsic::arm_neon_aesd: 3440 case Intrinsic::arm_neon_aese: 3441 case Intrinsic::aarch64_crypto_aesd: 3442 case Intrinsic::aarch64_crypto_aese: { 3443 Value *DataArg = II->getArgOperand(0); 3444 Value *KeyArg = II->getArgOperand(1); 3445 3446 // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR 3447 Value *Data, *Key; 3448 if (match(KeyArg, m_ZeroInt()) && 3449 match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) { 3450 replaceOperand(*II, 0, Data); 3451 replaceOperand(*II, 1, Key); 3452 return II; 3453 } 3454 break; 3455 } 3456 case Intrinsic::arm_mve_pred_i2v: { 3457 Value *Arg = II->getArgOperand(0); 3458 Value *ArgArg; 3459 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && 3460 II->getType() == ArgArg->getType()) 3461 return replaceInstUsesWith(*II, ArgArg); 3462 Constant *XorMask; 3463 if (match(Arg, 3464 m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), 3465 m_Constant(XorMask))) && 3466 II->getType() == ArgArg->getType()) { 3467 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { 3468 if (CI->getValue().trunc(16).isAllOnesValue()) { 3469 auto TrueVector = Builder.CreateVectorSplat( 3470 cast<VectorType>(II->getType())->getNumElements(), 3471 Builder.getTrue()); 3472 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); 3473 } 3474 } 3475 } 3476 KnownBits ScalarKnown(32); 3477 if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), 3478 ScalarKnown, 0)) 3479 return II; 3480 break; 3481 } 3482 case Intrinsic::arm_mve_pred_v2i: { 3483 Value *Arg = II->getArgOperand(0); 3484 Value *ArgArg; 3485 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) 3486 return replaceInstUsesWith(*II, ArgArg); 3487 if (!II->getMetadata(LLVMContext::MD_range)) { 3488 Type *IntTy32 = Type::getInt32Ty(II->getContext()); 3489 Metadata *M[] = { 3490 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), 3491 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) 3492 }; 3493 II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); 3494 return II; 3495 } 3496 break; 3497 } 3498 case Intrinsic::arm_mve_vadc: 3499 case Intrinsic::arm_mve_vadc_predicated: { 3500 unsigned CarryOp = 3501 (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; 3502 assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && 3503 "Bad type for intrinsic!"); 3504 3505 KnownBits CarryKnown(32); 3506 if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), 3507 CarryKnown)) 3508 return II; 3509 break; 3510 } 3511 case Intrinsic::amdgcn_rcp: { 3512 Value *Src = II->getArgOperand(0); 3513 3514 // TODO: Move to ConstantFolding/InstSimplify? 
3515 if (isa<UndefValue>(Src)) { 3516 Type *Ty = II->getType(); 3517 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); 3518 return replaceInstUsesWith(CI, QNaN); 3519 } 3520 3521 if (II->isStrictFP()) 3522 break; 3523 3524 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3525 const APFloat &ArgVal = C->getValueAPF(); 3526 APFloat Val(ArgVal.getSemantics(), 1); 3527 Val.divide(ArgVal, APFloat::rmNearestTiesToEven); 3528 3529 // This is more precise than the instruction may give. 3530 // 3531 // TODO: The instruction always flushes denormal results (except for f16), 3532 // should this also? 3533 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3534 } 3535 3536 break; 3537 } 3538 case Intrinsic::amdgcn_rsq: { 3539 Value *Src = II->getArgOperand(0); 3540 3541 // TODO: Move to ConstantFolding/InstSimplify? 3542 if (isa<UndefValue>(Src)) { 3543 Type *Ty = II->getType(); 3544 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); 3545 return replaceInstUsesWith(CI, QNaN); 3546 } 3547 3548 break; 3549 } 3550 case Intrinsic::amdgcn_frexp_mant: 3551 case Intrinsic::amdgcn_frexp_exp: { 3552 Value *Src = II->getArgOperand(0); 3553 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3554 int Exp; 3555 APFloat Significand = frexp(C->getValueAPF(), Exp, 3556 APFloat::rmNearestTiesToEven); 3557 3558 if (IID == Intrinsic::amdgcn_frexp_mant) { 3559 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3560 Significand)); 3561 } 3562 3563 // Match instruction special case behavior. 3564 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3565 Exp = 0; 3566 3567 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3568 } 3569 3570 if (isa<UndefValue>(Src)) 3571 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3572 3573 break; 3574 } 3575 case Intrinsic::amdgcn_class: { 3576 enum { 3577 S_NAN = 1 << 0, // Signaling NaN 3578 Q_NAN = 1 << 1, // Quiet NaN 3579 N_INFINITY = 1 << 2, // Negative infinity 3580 N_NORMAL = 1 << 3, // Negative normal 3581 N_SUBNORMAL = 1 << 4, // Negative subnormal 3582 N_ZERO = 1 << 5, // Negative zero 3583 P_ZERO = 1 << 6, // Positive zero 3584 P_SUBNORMAL = 1 << 7, // Positive subnormal 3585 P_NORMAL = 1 << 8, // Positive normal 3586 P_INFINITY = 1 << 9 // Positive infinity 3587 }; 3588 3589 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3590 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3591 3592 Value *Src0 = II->getArgOperand(0); 3593 Value *Src1 = II->getArgOperand(1); 3594 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3595 if (!CMask) { 3596 if (isa<UndefValue>(Src0)) 3597 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3598 3599 if (isa<UndefValue>(Src1)) 3600 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3601 break; 3602 } 3603 3604 uint32_t Mask = CMask->getZExtValue(); 3605 3606 // If all tests are made, it doesn't matter what the value is. 3607 if ((Mask & FullMask) == FullMask) 3608 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3609 3610 if ((Mask & FullMask) == 0) 3611 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3612 3613 if (Mask == (S_NAN | Q_NAN)) { 3614 // Equivalent of isnan. Replace with standard fcmp. 
3615 Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); 3616 FCmp->takeName(II); 3617 return replaceInstUsesWith(*II, FCmp); 3618 } 3619 3620 if (Mask == (N_ZERO | P_ZERO)) { 3621 // Equivalent of == 0. 3622 Value *FCmp = Builder.CreateFCmpOEQ( 3623 Src0, ConstantFP::get(Src0->getType(), 0.0)); 3624 3625 FCmp->takeName(II); 3626 return replaceInstUsesWith(*II, FCmp); 3627 } 3628 3629 // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other 3630 if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) 3631 return replaceOperand(*II, 1, ConstantInt::get(Src1->getType(), 3632 Mask & ~(S_NAN | Q_NAN))); 3633 3634 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3635 if (!CVal) { 3636 if (isa<UndefValue>(Src0)) 3637 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3638 3639 // Clamp mask to used bits 3640 if ((Mask & FullMask) != Mask) { 3641 CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), 3642 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3643 ); 3644 3645 NewCall->takeName(II); 3646 return replaceInstUsesWith(*II, NewCall); 3647 } 3648 3649 break; 3650 } 3651 3652 const APFloat &Val = CVal->getValueAPF(); 3653 3654 bool Result = 3655 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3656 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3657 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3658 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3659 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3660 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3661 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3662 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3663 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3664 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3665 3666 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3667 } 3668 case Intrinsic::amdgcn_cvt_pkrtz: { 3669 Value *Src0 = II->getArgOperand(0); 3670 Value *Src1 = II->getArgOperand(1); 3671 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3672 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3673 const fltSemantics &HalfSem 3674 = II->getType()->getScalarType()->getFltSemantics(); 3675 bool LosesInfo; 3676 APFloat Val0 = C0->getValueAPF(); 3677 APFloat Val1 = C1->getValueAPF(); 3678 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3679 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3680 3681 Constant *Folded = ConstantVector::get({ 3682 ConstantFP::get(II->getContext(), Val0), 3683 ConstantFP::get(II->getContext(), Val1) }); 3684 return replaceInstUsesWith(*II, Folded); 3685 } 3686 } 3687 3688 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3689 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3690 3691 break; 3692 } 3693 case Intrinsic::amdgcn_cvt_pknorm_i16: 3694 case Intrinsic::amdgcn_cvt_pknorm_u16: 3695 case Intrinsic::amdgcn_cvt_pk_i16: 3696 case Intrinsic::amdgcn_cvt_pk_u16: { 3697 Value *Src0 = II->getArgOperand(0); 3698 Value *Src1 = II->getArgOperand(1); 3699 3700 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3701 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3702 3703 break; 3704 } 3705 case Intrinsic::amdgcn_ubfe: 3706 case Intrinsic::amdgcn_sbfe: { 3707 // Decompose simple cases into standard shifts. 
    Value *Src = II->getArgOperand(0);
    if (isa<UndefValue>(Src))
      return replaceInstUsesWith(*II, Src);

    unsigned Width;
    Type *Ty = II->getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0)
        return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize)
        return replaceOperand(*II, 2, ConstantInt::get(CWidth->getType(),
                                                       Width & (IntSize - 1)));
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize)
        return replaceOperand(*II, 1, ConstantInt::get(COffset->getType(),
                                                       Offset & (IntSize - 1)));
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // values since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
                                 : Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(II);
      return replaceInstUsesWith(*II, RightShift);
    }

    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
                               : Builder.CreateLShr(Src, Offset);

    RightShift->takeName(II);
    return replaceInstUsesWith(*II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II->getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          replaceOperand(*II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed)
      return II;

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);
    Value *Src2 = II->getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
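    // With a NaN (or undef) operand, the median collapses to a two-operand
    // min/max of the remaining values, which is what the folds below produce.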
3799 CallInst *NewCall = nullptr; 3800 if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) { 3801 NewCall = Builder.CreateMinNum(Src1, Src2); 3802 } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) { 3803 NewCall = Builder.CreateMinNum(Src0, Src2); 3804 } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3805 NewCall = Builder.CreateMaxNum(Src0, Src1); 3806 } 3807 3808 if (NewCall) { 3809 NewCall->copyFastMathFlags(II); 3810 NewCall->takeName(II); 3811 return replaceInstUsesWith(*II, NewCall); 3812 } 3813 3814 bool Swap = false; 3815 // Canonicalize constants to RHS operands. 3816 // 3817 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3818 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3819 std::swap(Src0, Src1); 3820 Swap = true; 3821 } 3822 3823 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3824 std::swap(Src1, Src2); 3825 Swap = true; 3826 } 3827 3828 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3829 std::swap(Src0, Src1); 3830 Swap = true; 3831 } 3832 3833 if (Swap) { 3834 II->setArgOperand(0, Src0); 3835 II->setArgOperand(1, Src1); 3836 II->setArgOperand(2, Src2); 3837 return II; 3838 } 3839 3840 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3841 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3842 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3843 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), 3844 C2->getValueAPF()); 3845 return replaceInstUsesWith(*II, 3846 ConstantFP::get(Builder.getContext(), Result)); 3847 } 3848 } 3849 } 3850 3851 break; 3852 } 3853 case Intrinsic::amdgcn_icmp: 3854 case Intrinsic::amdgcn_fcmp: { 3855 const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2)); 3856 // Guard against invalid arguments. 3857 int64_t CCVal = CC->getZExtValue(); 3858 bool IsInteger = IID == Intrinsic::amdgcn_icmp; 3859 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3860 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3861 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3862 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3863 break; 3864 3865 Value *Src0 = II->getArgOperand(0); 3866 Value *Src1 = II->getArgOperand(1); 3867 3868 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3869 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3870 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3871 if (CCmp->isNullValue()) { 3872 return replaceInstUsesWith( 3873 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3874 } 3875 3876 // The result of V_ICMP/V_FCMP assembly instructions (which this 3877 // intrinsic exposes) is one bit per thread, masked with the EXEC 3878 // register (which contains the bitmask of live threads). So a 3879 // comparison that always returns true is the same as a read of the 3880 // EXEC register. 3881 Function *NewF = Intrinsic::getDeclaration( 3882 II->getModule(), Intrinsic::read_register, II->getType()); 3883 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3884 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3885 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3886 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3887 NewCall->addAttribute(AttributeList::FunctionIndex, 3888 Attribute::Convergent); 3889 NewCall->takeName(II); 3890 return replaceInstUsesWith(*II, NewCall); 3891 } 3892 3893 // Canonicalize constants to RHS. 
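      // Swapping the compare operands only preserves the result if the
      // predicate is swapped as well.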
3894 CmpInst::Predicate SwapPred 3895 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3896 II->setArgOperand(0, Src1); 3897 II->setArgOperand(1, Src0); 3898 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3899 static_cast<int>(SwapPred))); 3900 return II; 3901 } 3902 3903 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3904 break; 3905 3906 // Canonicalize compare eq with true value to compare != 0 3907 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3908 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3909 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3910 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3911 Value *ExtSrc; 3912 if (CCVal == CmpInst::ICMP_EQ && 3913 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3914 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3915 ExtSrc->getType()->isIntegerTy(1)) { 3916 replaceOperand(*II, 1, ConstantInt::getNullValue(Src1->getType())); 3917 replaceOperand(*II, 2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3918 return II; 3919 } 3920 3921 CmpInst::Predicate SrcPred; 3922 Value *SrcLHS; 3923 Value *SrcRHS; 3924 3925 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3926 // intrinsic. The typical use is a wave vote function in the library, which 3927 // will be fed from a user code condition compared with 0. Fold in the 3928 // redundant compare. 3929 3930 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3931 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3932 // 3933 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3934 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3935 if (match(Src1, m_Zero()) && 3936 match(Src0, 3937 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3938 if (CCVal == CmpInst::ICMP_EQ) 3939 SrcPred = CmpInst::getInversePredicate(SrcPred); 3940 3941 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 3942 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3943 3944 Type *Ty = SrcLHS->getType(); 3945 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { 3946 // Promote to next legal integer type. 3947 unsigned Width = CmpType->getBitWidth(); 3948 unsigned NewWidth = Width; 3949 3950 // Don't do anything for i1 comparisons. 3951 if (Width == 1) 3952 break; 3953 3954 if (Width <= 16) 3955 NewWidth = 16; 3956 else if (Width <= 32) 3957 NewWidth = 32; 3958 else if (Width <= 64) 3959 NewWidth = 64; 3960 else if (Width > 64) 3961 break; // Can't handle this. 3962 3963 if (Width != NewWidth) { 3964 IntegerType *CmpTy = Builder.getIntNTy(NewWidth); 3965 if (CmpInst::isSigned(SrcPred)) { 3966 SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); 3967 SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); 3968 } else { 3969 SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); 3970 SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); 3971 } 3972 } 3973 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) 3974 break; 3975 3976 Function *NewF = 3977 Intrinsic::getDeclaration(II->getModule(), NewIID, 3978 { II->getType(), 3979 SrcLHS->getType() }); 3980 Value *Args[] = { SrcLHS, SrcRHS, 3981 ConstantInt::get(CC->getType(), SrcPred) }; 3982 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3983 NewCall->takeName(II); 3984 return replaceInstUsesWith(*II, NewCall); 3985 } 3986 3987 break; 3988 } 3989 case Intrinsic::amdgcn_ballot: { 3990 if (auto *Src = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 3991 if (Src->isZero()) { 3992 // amdgcn.ballot(i1 0) is zero. 
3993 return replaceInstUsesWith(*II, Constant::getNullValue(II->getType())); 3994 } 3995 3996 if (Src->isOne()) { 3997 // amdgcn.ballot(i1 1) is exec. 3998 const char *RegName = "exec"; 3999 if (II->getType()->isIntegerTy(32)) 4000 RegName = "exec_lo"; 4001 else if (!II->getType()->isIntegerTy(64)) 4002 break; 4003 4004 Function *NewF = Intrinsic::getDeclaration( 4005 II->getModule(), Intrinsic::read_register, II->getType()); 4006 Metadata *MDArgs[] = {MDString::get(II->getContext(), RegName)}; 4007 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 4008 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 4009 CallInst *NewCall = Builder.CreateCall(NewF, Args); 4010 NewCall->addAttribute(AttributeList::FunctionIndex, 4011 Attribute::Convergent); 4012 NewCall->takeName(II); 4013 return replaceInstUsesWith(*II, NewCall); 4014 } 4015 } 4016 break; 4017 } 4018 case Intrinsic::amdgcn_wqm_vote: { 4019 // wqm_vote is identity when the argument is constant. 4020 if (!isa<Constant>(II->getArgOperand(0))) 4021 break; 4022 4023 return replaceInstUsesWith(*II, II->getArgOperand(0)); 4024 } 4025 case Intrinsic::amdgcn_kill: { 4026 const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0)); 4027 if (!C || !C->getZExtValue()) 4028 break; 4029 4030 // amdgcn.kill(i1 1) is a no-op 4031 return eraseInstFromFunction(CI); 4032 } 4033 case Intrinsic::amdgcn_update_dpp: { 4034 Value *Old = II->getArgOperand(0); 4035 4036 auto BC = cast<ConstantInt>(II->getArgOperand(5)); 4037 auto RM = cast<ConstantInt>(II->getArgOperand(3)); 4038 auto BM = cast<ConstantInt>(II->getArgOperand(4)); 4039 if (BC->isZeroValue() || 4040 RM->getZExtValue() != 0xF || 4041 BM->getZExtValue() != 0xF || 4042 isa<UndefValue>(Old)) 4043 break; 4044 4045 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. 4046 return replaceOperand(*II, 0, UndefValue::get(Old->getType())); 4047 } 4048 case Intrinsic::amdgcn_permlane16: 4049 case Intrinsic::amdgcn_permlanex16: { 4050 // Discard vdst_in if it's not going to be read. 4051 Value *VDstIn = II->getArgOperand(0); 4052 if (isa<UndefValue>(VDstIn)) 4053 break; 4054 4055 ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4)); 4056 ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5)); 4057 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) 4058 break; 4059 4060 return replaceOperand(*II, 0, UndefValue::get(VDstIn->getType())); 4061 } 4062 case Intrinsic::amdgcn_readfirstlane: 4063 case Intrinsic::amdgcn_readlane: { 4064 // A constant value is trivially uniform. 4065 if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0))) 4066 return replaceInstUsesWith(*II, C); 4067 4068 // The rest of these may not be safe if the exec may not be the same between 4069 // the def and use. 
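    // Conservatively restrict the remaining folds to sources defined in the
    // same block.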
4070 Value *Src = II->getArgOperand(0); 4071 Instruction *SrcInst = dyn_cast<Instruction>(Src); 4072 if (SrcInst && SrcInst->getParent() != II->getParent()) 4073 break; 4074 4075 // readfirstlane (readfirstlane x) -> readfirstlane x 4076 // readlane (readfirstlane x), y -> readfirstlane x 4077 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) 4078 return replaceInstUsesWith(*II, Src); 4079 4080 if (IID == Intrinsic::amdgcn_readfirstlane) { 4081 // readfirstlane (readlane x, y) -> readlane x, y 4082 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>())) 4083 return replaceInstUsesWith(*II, Src); 4084 } else { 4085 // readlane (readlane x, y), y -> readlane x, y 4086 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>( 4087 m_Value(), m_Specific(II->getArgOperand(1))))) 4088 return replaceInstUsesWith(*II, Src); 4089 } 4090 4091 break; 4092 } 4093 case Intrinsic::amdgcn_ldexp: { 4094 // FIXME: This doesn't introduce new instructions and belongs in 4095 // InstructionSimplify. 4096 Type *Ty = II->getType(); 4097 Value *Op0 = II->getArgOperand(0); 4098 Value *Op1 = II->getArgOperand(1); 4099 4100 // Folding undef to qnan is safe regardless of the FP mode. 4101 if (isa<UndefValue>(Op0)) { 4102 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); 4103 return replaceInstUsesWith(*II, QNaN); 4104 } 4105 4106 const APFloat *C = nullptr; 4107 match(Op0, m_APFloat(C)); 4108 4109 // FIXME: Should flush denorms depending on FP mode, but that's ignored 4110 // everywhere else. 4111 // 4112 // These cases should be safe, even with strictfp. 4113 // ldexp(0.0, x) -> 0.0 4114 // ldexp(-0.0, x) -> -0.0 4115 // ldexp(inf, x) -> inf 4116 // ldexp(-inf, x) -> -inf 4117 if (C && (C->isZero() || C->isInfinity())) 4118 return replaceInstUsesWith(*II, Op0); 4119 4120 // With strictfp, be more careful about possibly needing to flush denormals 4121 // or not, and snan behavior depends on ieee_mode. 4122 if (II->isStrictFP()) 4123 break; 4124 4125 if (C && C->isNaN()) { 4126 // FIXME: We just need to make the nan quiet here, but that's unavailable 4127 // on APFloat, only IEEEfloat 4128 auto *Quieted = ConstantFP::get( 4129 Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven)); 4130 return replaceInstUsesWith(*II, Quieted); 4131 } 4132 4133 // ldexp(x, 0) -> x 4134 // ldexp(x, undef) -> x 4135 if (isa<UndefValue>(Op1) || match(Op1, m_ZeroInt())) 4136 return replaceInstUsesWith(*II, Op0); 4137 4138 break; 4139 } 4140 case Intrinsic::hexagon_V6_vandvrt: 4141 case Intrinsic::hexagon_V6_vandvrt_128B: { 4142 // Simplify Q -> V -> Q conversion. 4143 if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 4144 Intrinsic::ID ID0 = Op0->getIntrinsicID(); 4145 if (ID0 != Intrinsic::hexagon_V6_vandqrt && 4146 ID0 != Intrinsic::hexagon_V6_vandqrt_128B) 4147 break; 4148 Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1); 4149 uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue(); 4150 uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue(); 4151 // Check if every byte has common bits in Bytes and Mask. 4152 uint64_t C = Bytes1 & Mask1; 4153 if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000)) 4154 return replaceInstUsesWith(*II, Op0->getArgOperand(0)); 4155 } 4156 break; 4157 } 4158 case Intrinsic::stackrestore: { 4159 // If the save is right next to the restore, remove the restore. This can 4160 // happen when variable allocas are DCE'd. 
4161 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 4162 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 4163 // Skip over debug info. 4164 if (SS->getNextNonDebugInstruction() == II) { 4165 return eraseInstFromFunction(CI); 4166 } 4167 } 4168 } 4169 4170 // Scan down this block to see if there is another stack restore in the 4171 // same block without an intervening call/alloca. 4172 BasicBlock::iterator BI(II); 4173 Instruction *TI = II->getParent()->getTerminator(); 4174 bool CannotRemove = false; 4175 for (++BI; &*BI != TI; ++BI) { 4176 if (isa<AllocaInst>(BI)) { 4177 CannotRemove = true; 4178 break; 4179 } 4180 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 4181 if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) { 4182 // If there is a stackrestore below this one, remove this one. 4183 if (II2->getIntrinsicID() == Intrinsic::stackrestore) 4184 return eraseInstFromFunction(CI); 4185 4186 // Bail if we cross over an intrinsic with side effects, such as 4187 // llvm.stacksave, or llvm.read_register. 4188 if (II2->mayHaveSideEffects()) { 4189 CannotRemove = true; 4190 break; 4191 } 4192 } else { 4193 // If we found a non-intrinsic call, we can't remove the stack 4194 // restore. 4195 CannotRemove = true; 4196 break; 4197 } 4198 } 4199 } 4200 4201 // If the stack restore is in a return, resume, or unwind block and if there 4202 // are no allocas or calls between the restore and the return, nuke the 4203 // restore. 4204 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 4205 return eraseInstFromFunction(CI); 4206 break; 4207 } 4208 case Intrinsic::lifetime_end: 4209 // Asan needs to poison memory to detect invalid access which is possible 4210 // even for empty lifetime range. 4211 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || 4212 II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || 4213 II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) 4214 break; 4215 4216 if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) { 4217 return I.getIntrinsicID() == Intrinsic::lifetime_start; 4218 })) 4219 return nullptr; 4220 break; 4221 case Intrinsic::assume: { 4222 Value *IIOperand = II->getArgOperand(0); 4223 // Remove an assume if it is followed by an identical assume. 4224 // TODO: Do we need this? Unless there are conflicting assumptions, the 4225 // computeKnownBits(IIOperand) below here eliminates redundant assumes. 4226 Instruction *Next = II->getNextNonDebugInstruction(); 4227 if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 4228 return eraseInstFromFunction(CI); 4229 4230 // Canonicalize assume(a && b) -> assume(a); assume(b); 4231 // Note: New assumption intrinsics created here are registered by 4232 // the InstCombineIRInserter object. 
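    // Splitting the conjunction lets each condition be tracked and queried
    // individually through the assumption cache.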
4233 FunctionType *AssumeIntrinsicTy = II->getFunctionType(); 4234 Value *AssumeIntrinsic = II->getCalledOperand(); 4235 Value *A, *B; 4236 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 4237 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); 4238 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); 4239 return eraseInstFromFunction(*II); 4240 } 4241 // assume(!(a || b)) -> assume(!a); assume(!b); 4242 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 4243 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4244 Builder.CreateNot(A), II->getName()); 4245 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4246 Builder.CreateNot(B), II->getName()); 4247 return eraseInstFromFunction(*II); 4248 } 4249 4250 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 4251 // (if assume is valid at the load) 4252 CmpInst::Predicate Pred; 4253 Instruction *LHS; 4254 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 4255 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 4256 LHS->getType()->isPointerTy() && 4257 isValidAssumeForContext(II, LHS, &DT)) { 4258 MDNode *MD = MDNode::get(II->getContext(), None); 4259 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 4260 return eraseInstFromFunction(*II); 4261 4262 // TODO: apply nonnull return attributes to calls and invokes 4263 // TODO: apply range metadata for range check patterns? 4264 } 4265 4266 // If there is a dominating assume with the same condition as this one, 4267 // then this one is redundant, and should be removed. 4268 KnownBits Known(1); 4269 computeKnownBits(IIOperand, Known, 0, II); 4270 if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II)) 4271 return eraseInstFromFunction(*II); 4272 4273 // Update the cache of affected values for this assumption (we might be 4274 // here because we just simplified the condition). 4275 AC.updateAffectedValues(II); 4276 break; 4277 } 4278 case Intrinsic::experimental_gc_relocate: { 4279 auto &GCR = *cast<GCRelocateInst>(II); 4280 4281 // If we have two copies of the same pointer in the statepoint argument 4282 // list, canonicalize to one. This may let us common gc.relocates. 4283 if (GCR.getBasePtr() == GCR.getDerivedPtr() && 4284 GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { 4285 auto *OpIntTy = GCR.getOperand(2)->getType(); 4286 return replaceOperand(*II, 2, 4287 ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); 4288 } 4289 4290 // Translate facts known about a pointer before relocating into 4291 // facts about the relocate value, while being careful to 4292 // preserve relocation semantics. 4293 Value *DerivedPtr = GCR.getDerivedPtr(); 4294 4295 // Remove the relocation if unused, note that this check is required 4296 // to prevent the cases below from looping forever. 4297 if (II->use_empty()) 4298 return eraseInstFromFunction(*II); 4299 4300 // Undef is undef, even after relocation. 4301 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 4302 // most practical collectors, but there was discussion in the review thread 4303 // about whether it was legal for all possible collectors. 4304 if (isa<UndefValue>(DerivedPtr)) 4305 // Use undef of gc_relocate's type to replace it. 4306 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 4307 4308 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 4309 // The relocation of null will be null for most any collector. 4310 // TODO: provide a hook for this in GCStrategy. 
      // There might be some
      // weird collector this property does not hold for.
      if (isa<ConstantPointerNull>(DerivedPtr))
        // Use null-pointer of gc_relocate's type to replace it.
        return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));

      // isKnownNonNull -> nonnull attribute
      if (!II->hasRetAttr(Attribute::NonNull) &&
          isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) {
        II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
        return II;
      }
    }

    // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
    // Canonicalize on the type from the uses to the defs

    // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
    break;
  }

  case Intrinsic::experimental_guard: {
    // Is this guard followed by another guard? We scan forward over a small
    // fixed window of instructions to handle common cases with conditions
    // computed between guards.
    Instruction *NextInst = II->getNextNonDebugInstruction();
    for (unsigned i = 0; i < GuardWideningWindow; i++) {
      // Note: Using context-free form to avoid compile time blow up
      if (!isSafeToSpeculativelyExecute(NextInst))
        break;
      NextInst = NextInst->getNextNonDebugInstruction();
    }
    Value *NextCond = nullptr;
    if (match(NextInst,
              m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
      Value *CurrCond = II->getArgOperand(0);

      // Remove a guard that is immediately preceded by an identical guard.
      // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
      if (CurrCond != NextCond) {
        Instruction *MoveI = II->getNextNonDebugInstruction();
        while (MoveI != NextInst) {
          auto *Temp = MoveI;
          MoveI = MoveI->getNextNonDebugInstruction();
          Temp->moveBefore(II);
        }
        replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
      }
      eraseInstFromFunction(*NextInst);
      return II;
    }
    break;
  }
  }
  return visitCallBase(*II);
}

// Fence instruction simplification
Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
  // Remove identical consecutive fences.
  Instruction *Next = FI.getNextNonDebugInstruction();
  if (auto *NFI = dyn_cast<FenceInst>(Next))
    if (FI.isIdenticalTo(NFI))
      return eraseInstFromFunction(FI);
  return nullptr;
}

// InvokeInst simplification
Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
  return visitCallBase(II);
}

// CallBrInst simplification
Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) {
  return visitCallBase(CBI);
}

/// If this cast does not affect the value passed through the varargs area, we
/// can eliminate the use of the cast.
static bool isSafeToEliminateVarargsCast(const CallBase &Call,
                                         const DataLayout &DL,
                                         const CastInst *const CI,
                                         const int ix) {
  if (!CI->isLosslessCast())
    return false;

  // If this is a GC intrinsic, avoid munging types. We need types for
  // statepoint reconstruction in SelectionDAG.
  // TODO: This is probably something which should be expanded to all
  // intrinsics since the entire point of intrinsics is that
  // they are understandable by the optimizer.
4401 if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) || 4402 isa<GCResultInst>(Call)) 4403 return false; 4404 4405 // The size of ByVal or InAlloca arguments is derived from the type, so we 4406 // can't change to a type with a different size. If the size were 4407 // passed explicitly we could avoid this check. 4408 if (!Call.isPassPointeeByValueArgument(ix)) 4409 return true; 4410 4411 Type* SrcTy = 4412 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 4413 Type *DstTy = Call.isByValArgument(ix) 4414 ? Call.getParamByValType(ix) 4415 : cast<PointerType>(CI->getType())->getElementType(); 4416 if (!SrcTy->isSized() || !DstTy->isSized()) 4417 return false; 4418 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 4419 return false; 4420 return true; 4421 } 4422 4423 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 4424 if (!CI->getCalledFunction()) return nullptr; 4425 4426 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 4427 replaceInstUsesWith(*From, With); 4428 }; 4429 auto InstCombineErase = [this](Instruction *I) { 4430 eraseInstFromFunction(*I); 4431 }; 4432 LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, 4433 InstCombineErase); 4434 if (Value *With = Simplifier.optimizeCall(CI, Builder)) { 4435 ++NumSimplified; 4436 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 4437 } 4438 4439 return nullptr; 4440 } 4441 4442 static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 4443 // Strip off at most one level of pointer casts, looking for an alloca. This 4444 // is good enough in practice and simpler than handling any number of casts. 4445 Value *Underlying = TrampMem->stripPointerCasts(); 4446 if (Underlying != TrampMem && 4447 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 4448 return nullptr; 4449 if (!isa<AllocaInst>(Underlying)) 4450 return nullptr; 4451 4452 IntrinsicInst *InitTrampoline = nullptr; 4453 for (User *U : TrampMem->users()) { 4454 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 4455 if (!II) 4456 return nullptr; 4457 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 4458 if (InitTrampoline) 4459 // More than one init_trampoline writes to this value. Give up. 4460 return nullptr; 4461 InitTrampoline = II; 4462 continue; 4463 } 4464 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 4465 // Allow any number of calls to adjust.trampoline. 4466 continue; 4467 return nullptr; 4468 } 4469 4470 // No call to init.trampoline found. 4471 if (!InitTrampoline) 4472 return nullptr; 4473 4474 // Check that the alloca is being used in the expected way. 4475 if (InitTrampoline->getOperand(0) != TrampMem) 4476 return nullptr; 4477 4478 return InitTrampoline; 4479 } 4480 4481 static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 4482 Value *TrampMem) { 4483 // Visit all the previous instructions in the basic block, and try to find a 4484 // init.trampoline which has a direct path to the adjust.trampoline. 
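  // Walk backwards; any intervening instruction that may write to memory could
  // clobber the trampoline, in which case we give up.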
  for (BasicBlock::iterator I = AdjustTramp->getIterator(),
                            E = AdjustTramp->getParent()->begin();
       I != E;) {
    Instruction *Inst = &*--I;
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
      if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
          II->getOperand(0) == TrampMem)
        return II;
    if (Inst->mayWriteToMemory())
      return nullptr;
  }
  return nullptr;
}

// Given a call to llvm.adjust.trampoline, find and return the corresponding
// call to llvm.init.trampoline if the call to the trampoline can be optimized
// to a direct call to a function. Otherwise return NULL.
static IntrinsicInst *findInitTrampoline(Value *Callee) {
  Callee = Callee->stripPointerCasts();
  IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
  if (!AdjustTramp ||
      AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
    return nullptr;

  Value *TrampMem = AdjustTramp->getOperand(0);

  if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
    return IT;
  if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
    return IT;
  return nullptr;
}

static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
  unsigned NumArgs = Call.getNumArgOperands();
  ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
  ConstantInt *Op1C =
      (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
  // Bail out if the allocation size is zero (or an invalid alignment of zero
  // with aligned_alloc).
  if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
    return;

  if (isMallocLikeFn(&Call, TLI) && Op0C) {
    if (isOpNewLikeFn(&Call, TLI))
      Call.addAttribute(AttributeList::ReturnIndex,
                        Attribute::getWithDereferenceableBytes(
                            Call.getContext(), Op0C->getZExtValue()));
    else
      Call.addAttribute(AttributeList::ReturnIndex,
                        Attribute::getWithDereferenceableOrNullBytes(
                            Call.getContext(), Op0C->getZExtValue()));
  } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) {
    Call.addAttribute(AttributeList::ReturnIndex,
                      Attribute::getWithDereferenceableOrNullBytes(
                          Call.getContext(), Op1C->getZExtValue()));
    // Add alignment attribute if alignment is a power of two constant.
4542 if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment)) { 4543 uint64_t AlignmentVal = Op0C->getZExtValue(); 4544 if (llvm::isPowerOf2_64(AlignmentVal)) 4545 Call.addAttribute(AttributeList::ReturnIndex, 4546 Attribute::getWithAlignment(Call.getContext(), 4547 Align(AlignmentVal))); 4548 } 4549 } else if (isReallocLikeFn(&Call, TLI) && Op1C) { 4550 Call.addAttribute(AttributeList::ReturnIndex, 4551 Attribute::getWithDereferenceableOrNullBytes( 4552 Call.getContext(), Op1C->getZExtValue())); 4553 } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { 4554 bool Overflow; 4555 const APInt &N = Op0C->getValue(); 4556 APInt Size = N.umul_ov(Op1C->getValue(), Overflow); 4557 if (!Overflow) 4558 Call.addAttribute(AttributeList::ReturnIndex, 4559 Attribute::getWithDereferenceableOrNullBytes( 4560 Call.getContext(), Size.getZExtValue())); 4561 } else if (isStrdupLikeFn(&Call, TLI)) { 4562 uint64_t Len = GetStringLength(Call.getOperand(0)); 4563 if (Len) { 4564 // strdup 4565 if (NumArgs == 1) 4566 Call.addAttribute(AttributeList::ReturnIndex, 4567 Attribute::getWithDereferenceableOrNullBytes( 4568 Call.getContext(), Len)); 4569 // strndup 4570 else if (NumArgs == 2 && Op1C) 4571 Call.addAttribute( 4572 AttributeList::ReturnIndex, 4573 Attribute::getWithDereferenceableOrNullBytes( 4574 Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); 4575 } 4576 } 4577 } 4578 4579 /// Improvements for call, callbr and invoke instructions. 4580 Instruction *InstCombiner::visitCallBase(CallBase &Call) { 4581 if (isAllocationFn(&Call, &TLI)) 4582 annotateAnyAllocSite(Call, &TLI); 4583 4584 bool Changed = false; 4585 4586 // Mark any parameters that are known to be non-null with the nonnull 4587 // attribute. This is helpful for inlining calls to functions with null 4588 // checks on their arguments. 4589 SmallVector<unsigned, 4> ArgNos; 4590 unsigned ArgNo = 0; 4591 4592 for (Value *V : Call.args()) { 4593 if (V->getType()->isPointerTy() && 4594 !Call.paramHasAttr(ArgNo, Attribute::NonNull) && 4595 isKnownNonZero(V, DL, 0, &AC, &Call, &DT)) 4596 ArgNos.push_back(ArgNo); 4597 ArgNo++; 4598 } 4599 4600 assert(ArgNo == Call.arg_size() && "sanity check"); 4601 4602 if (!ArgNos.empty()) { 4603 AttributeList AS = Call.getAttributes(); 4604 LLVMContext &Ctx = Call.getContext(); 4605 AS = AS.addParamAttribute(Ctx, ArgNos, 4606 Attribute::get(Ctx, Attribute::NonNull)); 4607 Call.setAttributes(AS); 4608 Changed = true; 4609 } 4610 4611 // If the callee is a pointer to a function, attempt to move any casts to the 4612 // arguments of the call/callbr/invoke. 4613 Value *Callee = Call.getCalledOperand(); 4614 if (!isa<Function>(Callee) && transformConstExprCastCall(Call)) 4615 return nullptr; 4616 4617 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 4618 // Remove the convergent attr on calls when the callee is not convergent. 4619 if (Call.isConvergent() && !CalleeF->isConvergent() && 4620 !CalleeF->isIntrinsic()) { 4621 LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call 4622 << "\n"); 4623 Call.setNotConvergent(); 4624 return &Call; 4625 } 4626 4627 // If the call and callee calling conventions don't match, this call must 4628 // be unreachable, as the call is undefined. 4629 if (CalleeF->getCallingConv() != Call.getCallingConv() && 4630 // Only do this for calls to a function with a body. A prototype may 4631 // not actually end up matching the implementation's calling conv for a 4632 // variety of reasons (e.g. it may be written in assembly). 
        !CalleeF->isDeclaration()) {
      Instruction *OldCall = &Call;
      CreateNonTerminatorUnreachable(OldCall);
      // If OldCall does not return void then replaceAllUsesWith undef.
      // This allows ValueHandlers and custom metadata to adjust themselves.
      if (!OldCall->getType()->isVoidTy())
        replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
      if (isa<CallInst>(OldCall))
        return eraseInstFromFunction(*OldCall);

      // We cannot remove an invoke or a callbr, because it would change the
      // CFG; just change the callee to a null pointer.
      cast<CallBase>(OldCall)->setCalledFunction(
          CalleeF->getFunctionType(),
          Constant::getNullValue(CalleeF->getType()));
      return nullptr;
    }
  }

  if ((isa<ConstantPointerNull>(Callee) &&
       !NullPointerIsDefined(Call.getFunction())) ||
      isa<UndefValue>(Callee)) {
    // If Call does not return void then replaceAllUsesWith undef.
    // This allows ValueHandlers and custom metadata to adjust themselves.
    if (!Call.getType()->isVoidTy())
      replaceInstUsesWith(Call, UndefValue::get(Call.getType()));

    if (Call.isTerminator()) {
      // Can't remove an invoke or callbr because we cannot change the CFG.
      return nullptr;
    }

    // This instruction is not reachable, just remove it.
    CreateNonTerminatorUnreachable(&Call);
    return eraseInstFromFunction(Call);
  }

  if (IntrinsicInst *II = findInitTrampoline(Callee))
    return transformCallThroughTrampoline(Call, *II);

  PointerType *PTy = cast<PointerType>(Callee->getType());
  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
  if (FTy->isVarArg()) {
    int ix = FTy->getNumParams();
    // See if we can optimize any arguments passed through the varargs area of
    // the call.
    for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
         I != E; ++I, ++ix) {
      CastInst *CI = dyn_cast<CastInst>(*I);
      if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
        replaceUse(*I, CI->getOperand(0));

        // Update the byval type to match the argument type.
        if (Call.isByValArgument(ix)) {
          Call.removeParamAttr(ix, Attribute::ByVal);
          Call.addParamAttr(
              ix, Attribute::getWithByValType(
                      Call.getContext(),
                      CI->getOperand(0)->getType()->getPointerElementType()));
        }
        Changed = true;
      }
    }
  }

  if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
    // Inline asm calls cannot throw - mark them 'nounwind'.
    Call.setDoesNotThrow();
    Changed = true;
  }

  // Try to optimize the call if possible; we require DataLayout for most of
  // this. None of these calls are seen as possibly dead so go ahead and
  // delete the instruction now.
  if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
    Instruction *I = tryOptimizeCall(CI);
    // If we changed something, return the result; otherwise fall through to
    // the checks below.
    if (I) return eraseInstFromFunction(*I);
  }

  if (!Call.use_empty() && !Call.isMustTailCall())
    if (Value *ReturnedArg = Call.getReturnedArgOperand()) {
      Type *CallTy = Call.getType();
      Type *RetArgTy = ReturnedArg->getType();
      if (RetArgTy->canLosslesslyBitCastTo(CallTy))
        return replaceInstUsesWith(
            Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
    }

  if (isAllocLikeFn(&Call, &TLI))
    return visitAllocSite(Call);

  return Changed ? &Call : nullptr;
}
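
// An illustrative example (not from the original comments) of the transform
// implemented by transformConstExprCastCall() below: the constexpr cast on
// the callee is removed by casting the argument instead, e.g.
//   %r = call i32 bitcast (i32 (i8*)* @f to i32 (i32*)*)(i32* %p)
// becomes
//   %0 = bitcast i32* %p to i8*
//   %r = call i32 @f(i8* %0)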
/// If the callee is a constexpr cast of a function, attempt to move the cast
/// to the arguments of the call/callbr/invoke.
bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
  auto *Callee =
      dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
  if (!Callee)
    return false;

  // If this is a call to a thunk function, don't remove the cast. Thunks are
  // used to transparently forward all incoming parameters and outgoing return
  // values, so it's important to leave the cast in place.
  if (Callee->hasFnAttribute("thunk"))
    return false;

  // If this is a musttail call, the callee's prototype must match the caller's
  // prototype with the exception of pointee types. The code below doesn't
  // implement that, so we can't do this transform.
  // TODO: Do the transform if it only requires adding pointer casts.
  if (Call.isMustTailCall())
    return false;

  Instruction *Caller = &Call;
  const AttributeList &CallerPAL = Call.getAttributes();

  // Okay, this is a cast from a function to a different type. Unless doing so
  // would cause a type conversion of one of our arguments, change this call to
  // be a direct call with arguments casted to the appropriate types.
  FunctionType *FT = Callee->getFunctionType();
  Type *OldRetTy = Caller->getType();
  Type *NewRetTy = FT->getReturnType();

  // Check to see if we are changing the return type...
  if (OldRetTy != NewRetTy) {

    if (NewRetTy->isStructTy())
      return false; // TODO: Handle multiple return values.

    if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
      if (Callee->isDeclaration())
        return false; // Cannot transform this return value.

      if (!Caller->use_empty() &&
          // void -> non-void is handled specially
          !NewRetTy->isVoidTy())
        return false; // Cannot transform this return value.
    }

    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
      AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
      if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
        return false; // Attribute not compatible with transformed value.
    }

    // If the callbase is an invoke/callbr instruction, and the return value is
    // used by a PHI node in a successor, we cannot change the return type of
    // the call because there is no place to put the cast instruction (without
    // breaking the critical edge). Bail out in this case.
    if (!Caller->use_empty()) {
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
        for (User *U : II->users())
          if (PHINode *PN = dyn_cast<PHINode>(U))
            if (PN->getParent() == II->getNormalDest() ||
                PN->getParent() == II->getUnwindDest())
              return false;
      // FIXME: Be conservative for callbr to avoid a quadratic search.
      if (isa<CallBrInst>(Caller))
        return false;
    }
  }

  unsigned NumActualArgs = Call.arg_size();
  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);

  // Prevent us turning:
  // declare void @takes_i32_inalloca(i32* inalloca)
  //  call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
  //
  // into:
  //  call void @takes_i32_inalloca(i32* null)
  //
  // Similarly, avoid folding away bitcasts of byval calls.
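  // (Like inalloca, the byval and preallocated attributes describe
  // ABI-significant pointee types and argument memory, so rewriting the
  // callee's signature here could change how those arguments are passed.)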
  if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
      Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
      Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
    return false;

  auto AI = Call.arg_begin();
  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
    Type *ParamTy = FT->getParamType(i);
    Type *ActTy = (*AI)->getType();

    if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
      return false; // Cannot transform this parameter value.

    if (AttrBuilder(CallerPAL.getParamAttributes(i))
            .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
      return false; // Attribute not compatible with transformed value.

    if (Call.isInAllocaArgument(i))
      return false; // Cannot transform to and from inalloca.

    // If the parameter is passed as a byval argument, then we have to have a
    // sized type and the sized type has to have the same size as the old type.
    if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
      PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
      if (!ParamPTy || !ParamPTy->getElementType()->isSized())
        return false;

      Type *CurElTy = Call.getParamByValType(i);
      if (DL.getTypeAllocSize(CurElTy) !=
          DL.getTypeAllocSize(ParamPTy->getElementType()))
        return false;
    }
  }

  if (Callee->isDeclaration()) {
    // Do not delete arguments unless we have a function body.
    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
      return false;

    // If the callee is just a declaration, don't change the varargsness of the
    // call. We don't want to introduce a varargs call where one doesn't
    // already exist.
    PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
    if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
      return false;

    // If both the callee and the cast type are varargs, we still have to make
    // sure the number of fixed parameters is the same or we have the same
    // ABI issues as if we introduce a varargs call.
    if (FT->isVarArg() &&
        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
        FT->getNumParams() !=
            cast<FunctionType>(APTy->getElementType())->getNumParams())
      return false;
  }

  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
      !CallerPAL.isEmpty()) {
    // In this case we have more arguments than the new function type, but we
    // won't be dropping them. Check that these extra arguments have attributes
    // that are compatible with being a vararg call argument.
    unsigned SRetIdx;
    if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
        SRetIdx > FT->getNumParams())
      return false;
  }

  // Okay, we decided that this is a safe thing to do: go ahead and start
  // inserting cast instructions as necessary.
  SmallVector<Value *, 8> Args;
  SmallVector<AttributeSet, 8> ArgAttrs;
  Args.reserve(NumActualArgs);
  ArgAttrs.reserve(NumActualArgs);

  // Get any return attributes.
  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);

  // If the return value is not being used, the type may not be compatible
  // with the existing attributes. Wipe out any problematic attributes.
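  // (For example, a 'noalias' or 'nonnull' return attribute is dropped here
  // when NewRetTy is not a pointer type.)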
  RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));

  LLVMContext &Ctx = Call.getContext();
  AI = Call.arg_begin();
  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
    Type *ParamTy = FT->getParamType(i);

    Value *NewArg = *AI;
    if ((*AI)->getType() != ParamTy)
      NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
    Args.push_back(NewArg);

    // Add any parameter attributes.
    if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
      AttrBuilder AB(CallerPAL.getParamAttributes(i));
      AB.addByValAttr(NewArg->getType()->getPointerElementType());
      ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
    } else
      ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
  }

  // If the function takes more arguments than the call was taking, add them
  // now.
  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
    ArgAttrs.push_back(AttributeSet());
  }

  // If the call supplies more arguments than the new function type expects,
  // keep the extra arguments only for a varargs callee, promoting them as
  // needed.
  if (FT->getNumParams() < NumActualArgs) {
    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
    if (FT->isVarArg()) {
      // Add all of the arguments in their promoted form to the arg list.
      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
        Type *PTy = getPromotedType((*AI)->getType());
        Value *NewArg = *AI;
        if (PTy != (*AI)->getType()) {
          // Must promote to pass through va_arg area!
          Instruction::CastOps opcode =
              CastInst::getCastOpcode(*AI, false, PTy, false);
          NewArg = Builder.CreateCast(opcode, *AI, PTy);
        }
        Args.push_back(NewArg);

        // Add any parameter attributes.
        ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
      }
    }
  }

  AttributeSet FnAttrs = CallerPAL.getFnAttributes();

  if (NewRetTy->isVoidTy())
    Caller->setName(""); // Void type should not have a name.

  assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
         "missing argument attributes");
  AttributeList NewCallerPAL = AttributeList::get(
      Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);

  SmallVector<OperandBundleDef, 1> OpBundles;
  Call.getOperandBundlesAsDefs(OpBundles);

  CallBase *NewCall;
  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
    NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
                                   II->getUnwindDest(), Args, OpBundles);
  } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
    NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
                                   CBI->getIndirectDests(), Args, OpBundles);
  } else {
    NewCall = Builder.CreateCall(Callee, Args, OpBundles);
    cast<CallInst>(NewCall)->setTailCallKind(
        cast<CallInst>(Caller)->getTailCallKind());
  }
  NewCall->takeName(Caller);
  NewCall->setCallingConv(Call.getCallingConv());
  NewCall->setAttributes(NewCallerPAL);

  // Preserve prof metadata if any.
  NewCall->copyMetadata(*Caller, {LLVMContext::MD_prof});

  // Insert a cast of the return type as necessary.
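  // Note that for an invoke or callbr the new call is a terminator, so the
  // cast cannot simply follow it in the same block; it is placed in the
  // normal (or default) destination block instead, as handled below.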
  Instruction *NC = NewCall;
  Value *NV = NC;
  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
    if (!NV->getType()->isVoidTy()) {
      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
      NC->setDebugLoc(Caller->getDebugLoc());

      // If this is an invoke/callbr instruction, we should insert it after the
      // first non-phi instruction in the normal successor block.
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
        BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else {
        // Otherwise, it's a call, just insert cast right after the call.
        InsertNewInstBefore(NC, *Caller);
      }
      Worklist.pushUsersToWorkList(*Caller);
    } else {
      NV = UndefValue::get(Caller->getType());
    }
  }

  if (!Caller->use_empty())
    replaceInstUsesWith(*Caller, NV);
  else if (Caller->hasValueHandle()) {
    if (OldRetTy == NV->getType())
      ValueHandleBase::ValueIsRAUWd(Caller, NV);
    else
      // We cannot call ValueIsRAUWd with a different type, and the
      // actual tracked value will disappear.
      ValueHandleBase::ValueIsDeleted(Caller);
  }

  eraseInstFromFunction(*Caller);
  return true;
}

/// Turn a call to a function created by init_trampoline / adjust_trampoline
/// intrinsic pair into a direct call to the underlying function.
Instruction *
InstCombiner::transformCallThroughTrampoline(CallBase &Call,
                                             IntrinsicInst &Tramp) {
  Value *Callee = Call.getCalledOperand();
  Type *CalleeTy = Callee->getType();
  FunctionType *FTy = Call.getFunctionType();
  AttributeList Attrs = Call.getAttributes();

  // If the call already has the 'nest' attribute somewhere then give up -
  // otherwise 'nest' would occur twice after splicing in the chain.
  if (Attrs.hasAttrSomewhere(Attribute::Nest))
    return nullptr;

  Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
  FunctionType *NestFTy = NestF->getFunctionType();

  AttributeList NestAttrs = NestF->getAttributes();
  if (!NestAttrs.isEmpty()) {
    unsigned NestArgNo = 0;
    Type *NestTy = nullptr;
    AttributeSet NestAttr;

    // Look for a parameter marked with the 'nest' attribute.
    for (FunctionType::param_iterator I = NestFTy->param_begin(),
                                      E = NestFTy->param_end();
         I != E; ++NestArgNo, ++I) {
      AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
      if (AS.hasAttribute(Attribute::Nest)) {
        // Record the parameter type and any other attributes.
        NestTy = *I;
        NestAttr = AS;
        break;
      }
    }

    if (NestTy) {
      std::vector<Value*> NewArgs;
      std::vector<AttributeSet> NewArgAttrs;
      NewArgs.reserve(Call.arg_size() + 1);
      NewArgAttrs.reserve(Call.arg_size());

      // Insert the nest argument into the call argument list, which may
      // mean appending it. Likewise for attributes.

      {
        unsigned ArgNo = 0;
        auto I = Call.arg_begin(), E = Call.arg_end();
        do {
          if (ArgNo == NestArgNo) {
            // Add the chain argument and attributes.
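            // (Tramp is the matched llvm.init.trampoline call; its third
            // operand is the chain value being spliced in.)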
            Value *NestVal = Tramp.getArgOperand(2);
            if (NestVal->getType() != NestTy)
              NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
            NewArgs.push_back(NestVal);
            NewArgAttrs.push_back(NestAttr);
          }

          if (I == E)
            break;

          // Add the original argument and attributes.
          NewArgs.push_back(*I);
          NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));

          ++ArgNo;
          ++I;
        } while (true);
      }

      // The trampoline may have been bitcast to a bogus type (FTy).
      // Handle this by synthesizing a new function type, equal to FTy
      // with the chain parameter inserted.

      std::vector<Type*> NewTypes;
      NewTypes.reserve(FTy->getNumParams() + 1);

      // Insert the chain's type into the list of parameter types, which may
      // mean appending it.
      {
        unsigned ArgNo = 0;
        FunctionType::param_iterator I = FTy->param_begin(),
                                     E = FTy->param_end();

        do {
          if (ArgNo == NestArgNo)
            // Add the chain's type.
            NewTypes.push_back(NestTy);

          if (I == E)
            break;

          // Add the original type.
          NewTypes.push_back(*I);

          ++ArgNo;
          ++I;
        } while (true);
      }

      // Replace the trampoline call with a direct call. Let the generic
      // code sort out any function type mismatches.
      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
                                               FTy->isVarArg());
      Constant *NewCallee =
          NestF->getType() == PointerType::getUnqual(NewFTy)
              ? NestF
              : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy));
      AttributeList NewPAL =
          AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
                             Attrs.getRetAttributes(), NewArgAttrs);

      SmallVector<OperandBundleDef, 1> OpBundles;
      Call.getOperandBundlesAsDefs(OpBundles);

      Instruction *NewCaller;
      if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
        NewCaller = InvokeInst::Create(NewFTy, NewCallee,
                                       II->getNormalDest(), II->getUnwindDest(),
                                       NewArgs, OpBundles);
        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
        NewCaller =
            CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
                               CBI->getIndirectDests(), NewArgs, OpBundles);
        cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
        cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
      } else {
        NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
        cast<CallInst>(NewCaller)->setTailCallKind(
            cast<CallInst>(Call).getTailCallKind());
        cast<CallInst>(NewCaller)->setCallingConv(
            cast<CallInst>(Call).getCallingConv());
        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
      }
      NewCaller->setDebugLoc(Call.getDebugLoc());

      return NewCaller;
    }
  }

  // Replace the trampoline call with a direct call. Since there is no 'nest'
  // parameter, there is no need to adjust the argument list. Let the generic
  // code sort out any function type mismatches.
  Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
  Call.setCalledFunction(FTy, NewCallee);
  return &Call;
}