//===- InstCombineCalls.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitCall, visitInvoke, and visitCallBr functions.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> GuardWideningWindow(
    "instcombine-guard-widening-window",
    cl::init(3),
    cl::desc("How wide an instruction window to bypass looking for "
             "another guard"));

/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
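/// For example (illustrative): <4 x i32> <i32 -8, i32 1, i32 -1, i32 0>
/// maps to <4 x i1> <i1 true, i1 false, i1 true, i1 false>.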
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
           "Unexpected constant data vector element type");
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}

Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
  unsigned CopyDstAlign = MI->getDestAlignment();
  if (CopyDstAlign < DstAlign) {
    MI->setDestAlignment(DstAlign);
    return MI;
  }

  unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
  unsigned CopySrcAlign = MI->getSourceAlignment();
  if (CopySrcAlign < SrcAlign) {
    MI->setSourceAlignment(SrcAlign);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
  // load/store.
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
  if (!MemOpLength) return nullptr;

  // Source and destination pointer types are always "i8*" for intrinsic. See
  // if the size is something we can handle with a single primitive load/store.
  // A single load+store correctly handles overlapping memory in the memmove
  // case.
  uint64_t Size = MemOpLength->getLimitedValue();
  assert(Size && "0-sized memory transferring should be removed already.");

  if (Size > 8 || (Size&(Size-1)))
    return nullptr;  // If not 1/2/4/8 bytes, exit.

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not evident performance gain so disable
  // it now.
  if (isa<AtomicMemTransferInst>(MI))
    if (CopyDstAlign < Size || CopySrcAlign < Size)
      return nullptr;

  // Use an integer load+store unless we can find something better.
  unsigned SrcAddrSp =
      cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
  unsigned DstAddrSp =
      cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();

  IntegerType *IntType = IntegerType::get(MI->getContext(), Size<<3);
  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);

  // If the memcpy has metadata describing the members, see if we can get the
  // TBAA tag describing our copy.
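  // A !tbaa.struct node is a list of (offset, size, TBAA tag) triples; the
  // check below only reuses the tag when there is a single field that covers
  // the entire copy (offset 0, size == Size).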
  MDNode *CopyMD = nullptr;
  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
    CopyMD = M;
  } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
    if (M->getNumOperands() == 3 && M->getOperand(0) &&
        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
        M->getOperand(1) &&
        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
            Size &&
        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
      CopyMD = cast<MDNode>(M->getOperand(2));
  }

  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
  LoadInst *L = Builder.CreateLoad(IntType, Src);
  // Alignment from the mem intrinsic will be better, so use it.
  L->setAlignment(
      MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  MDNode *LoopMemParallelMD =
      MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
  if (LoopMemParallelMD)
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
  if (AccessGroupMD)
    L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  StoreInst *S = Builder.CreateStore(L, Dest);
  // Alignment from the mem intrinsic will be better, so use it.
  S->setAlignment(
      MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  if (AccessGroupMD)
    S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
    // non-atomics can be volatile
    L->setVolatile(MT->isVolatile());
    S->setVolatile(MT->isVolatile());
  }
  if (isa<AtomicMemTransferInst>(MI)) {
    // atomics have to be unordered
    L->setOrdering(AtomicOrdering::Unordered);
    S->setOrdering(AtomicOrdering::Unordered);
  }

  // Set the size of the copy to 0, it will be deleted on the next iteration.
  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
  return MI;
}

Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
  const unsigned KnownAlignment =
      getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
  if (MI->getDestAlignment() < KnownAlignment) {
    MI->setDestAlignment(KnownAlignment);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // Extract the length and alignment and fill if they are constant.
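  // (Illustrative: a constant memset(p, 0xAB, 4) ends up below as a single
  // store of i32 0xABABABAB.)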
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
    return nullptr;
  const uint64_t Len = LenC->getLimitedValue();
  assert(Len && "0-sized memory setting should be removed already.");
  const Align Alignment = assumeAligned(MI->getDestAlignment());

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not evident performance gain so disable
  // it now.
  if (isa<AtomicMemSetInst>(MI))
    if (Alignment < Len)
      return nullptr;

  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

    Value *Dest = MI->getDest();
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);

    // Extract the fill value and store.
    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
                                       MI->isVolatile());
    S->setAlignment(Alignment);
    if (isa<AtomicMemSetInst>(MI))
      S->setOrdering(AtomicOrdering::Unordered);

    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(LenC->getType()));
    return MI;
  }

  return nullptr;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    LogicalShift = false; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    LogicalShift = true; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    LogicalShift = true; ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if count is constant.
  auto Arg1 = II.getArgOperand(1);
  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
  auto CDV = dyn_cast<ConstantDataVector>(Arg1);
  auto CInt = dyn_cast<ConstantInt>(Arg1);
  if (!CAZ && !CDV && !CInt)
    return nullptr;

  APInt Count(64, 0);
  if (CDV) {
    // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    auto VT = cast<VectorType>(CDV->getType());
    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
    assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
    unsigned NumSubElts = 64 / BitWidth;

    // Concatenate the sub-elements to create the 64-bit value.
    for (unsigned i = 0; i != NumSubElts; ++i) {
      unsigned SubEltIdx = (NumSubElts - 1) - i;
      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
      Count <<= BitWidth;
      Count |= SubElt->getValue().zextOrTrunc(64);
    }
  }
  else if (CInt)
    Count = CInt->getValue();

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
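// For example (illustrative):
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %x,
//                       <4 x i32> <i32 1, i32 2, i32 3, i32 33>)
// cannot become a plain 'lshr' because the out-of-range lane (33) must yield
// zero, whereas a generic shift by the bit width or more is not defined.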
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
  if (!CShift)
    return nullptr;

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(II.getType());
  auto SVT = VT->getVectorElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (CElt && isa<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  Type *ArgTy = Arg0->getType();
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getVectorNumElements();
  assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
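  // e.g. (illustrative) for a 256-bit PACKSSWB (<16 x i16> -> <32 x i8>) the
  // mask interleaves per 128-bit lane: 0..7, 16..23 (lane 0), then 8..15,
  // 24..31 (lane 1).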
  SmallVector<unsigned, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();
  Type *ArgTy = Arg->getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // We can't easily peek through x86_mmx types.
  if (!ArgTy->isVectorTy())
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getVectorNumElements();
  Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy));
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          { Op1, Op2 });
    // The types have to be adjusted to match the x86 call types.
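    // (addcarry returns { i8 carry-out, iN sum } whereas uadd.with.overflow
    // returns { iN sum, i1 overflow }, so the fields are swapped and the
    // overflow bit is widened to i8.)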
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //  [3:0] - zero mask for each 32-bit lane
  //  [5:4] - select one 32-bit destination lane
  //  [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      VectorType *ShufTy = VectorType::get(IntTy8, 16);

      SmallVector<Constant *, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(UndefValue::get(IntTy32));

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
    VectorType *ShufTy = VectorType::get(IntTy8, 16);

    SmallVector<Constant *, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(UndefValue::get(IntTy32));

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ConstantVector::get(ShuffleMask));
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
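/// For example (illustrative): a control byte with its top bit set writes
/// zero, and otherwise its low four bits select a byte from the same
/// 128-bit lane.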
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getVectorNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[16] = {nullptr};

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
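    // e.g. (illustrative) for vpermilvar.ps.256, a mask value of 1 in element
    // 5 selects element 1 of the upper lane, i.e. generic index 4 + 1 = 5.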
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

// TODO, Obvious Missing Transforms:
// * Narrow width by halves excluding zero/undef lanes
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
  Value *LoadPtr = II.getArgOperand(0);
  unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();

  // If the mask is all ones or undefs, this is a plain vector load of the 1st
  // argument.
  if (maskIsAllOneOrUndef(II.getArgOperand(2)))
    return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                     "unmaskedload");

  // If we can unconditionally load from this address, replace with a
  // load/select idiom. TODO: use DT for context sensitive query
  if (isDereferenceableAndAlignedPointer(
          LoadPtr, II.getType(), MaybeAlign(Alignment),
          II.getModule()->getDataLayout(), &II, nullptr)) {
    Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                          "unmaskedload");
    return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
  }

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Narrow width by halves excluding zero/undef lanes
Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, this instruction does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // If the mask is all ones, this is a plain vector store of the 1st argument.
  if (ConstMask->isAllOnesValue()) {
    Value *StorePtr = II.getArgOperand(1);
    MaybeAlign Alignment(
        cast<ConstantInt>(II.getArgOperand(2))->getZExtValue());
    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
  }

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts)) {
    II.setOperand(0, V);
    return &II;
  }

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane load -> load
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load
// * Narrow width by halves excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load
Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) {
  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Adjacent vector addresses -> masked.store
// * Narrow store width by halves excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar store
// * Vector incrementing address -> vector masked store
Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, a scatter does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts)) {
    II.setOperand(0, V);
    return &II;
  }
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1),
                                            DemandedElts, UndefElts)) {
    II.setOperand(1, V);
    return &II;
  }

  return nullptr;
}

/// This function transforms launder.invariant.group and strip.invariant.group
/// like:
/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
/// launder(strip(%x)) -> launder(%x)
/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
/// strip(launder(%x)) -> strip(%x)
/// This is legal because it preserves the most recent information about
/// the presence or absence of invariant.group.
static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
                                                    InstCombiner &IC) {
  auto *Arg = II.getArgOperand(0);
  auto *StrippedArg = Arg->stripPointerCasts();
  auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
  if (StrippedArg == StrippedInvariantGroupsArg)
    return nullptr; // No launders/strips to remove.

  Value *Result = nullptr;

  if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
    Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
  else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
    Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
  else
    llvm_unreachable(
        "simplifyInvariantGroupIntrinsic only handles launder and strip");
  if (Result->getType()->getPointerAddressSpace() !=
      II.getType()->getPointerAddressSpace())
    Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
  if (Result->getType() != II.getType())
    Result = IC.Builder.CreateBitCast(Result, II.getType());

  return cast<Instruction>(Result);
}

static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
  assert((II.getIntrinsicID() == Intrinsic::cttz ||
          II.getIntrinsicID() == Intrinsic::ctlz) &&
         "Expected cttz or ctlz intrinsic");
  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
  Value *Op0 = II.getArgOperand(0);
  Value *X;
  // ctlz(bitreverse(x)) -> cttz(x)
  // cttz(bitreverse(x)) -> ctlz(x)
  if (match(Op0, m_BitReverse(m_Value(X)))) {
    Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
    Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
    return CallInst::Create(F, {X, II.getArgOperand(1)});
  }

  if (IsTZ) {
    // cttz(-x) -> cttz(x)
    if (match(Op0, m_Neg(m_Value(X)))) {
      II.setOperand(0, X);
      return &II;
    }

    // cttz(abs(x)) -> cttz(x)
    // cttz(nabs(x)) -> cttz(x)
    Value *Y;
    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
    if (SPF == SPF_ABS || SPF == SPF_NABS) {
      II.setOperand(0, X);
      return &II;
    }
  }

  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);

  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
                                : Known.countMaxLeadingZeros();
  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
                                : Known.countMinLeadingZeros();

  // If all bits above (ctlz) or below (cttz) the first known one are known
  // zero, this value is constant.
  // FIXME: This should be in InstSimplify because we're replacing an
  // instruction with a constant.
  if (PossibleZeros == DefiniteZeros) {
    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
    return IC.replaceInstUsesWith(II, C);
  }

  // If the input to cttz/ctlz is known to be non-zero,
  // then change the 'ZeroIsUndef' parameter to 'true'
  // because we know the zero behavior can't affect the result.
  if (!Known.One.isNullValue() ||
      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
                     &IC.getDominatorTree())) {
    if (!match(II.getArgOperand(1), m_One())) {
      II.setOperand(1, IC.Builder.getTrue());
      return &II;
    }
  }

  // Add range metadata since known bits can't completely reflect what we know.
  // TODO: Handle splat vectors.
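  // (The !range attached below is [DefiniteZeros, PossibleZeros + 1), i.e. the
  // bounds on the cttz/ctlz result implied by the known bits.)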
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
         "Expected ctpop intrinsic");
  Value *Op0 = II.getArgOperand(0);
  Value *X;
  // ctpop(bitreverse(x)) -> ctpop(x)
  // ctpop(bswap(x)) -> ctpop(x)
  if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) {
    II.setOperand(0, X);
    return &II;
  }

  // FIXME: Try to simplify vectors of integers.
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
  if (!IT)
    return nullptr;

  unsigned BitWidth = IT->getBitWidth();
  KnownBits Known(BitWidth);
  IC.computeKnownBits(Op0, Known, 0, &II);

  unsigned MinCount = Known.countMinPopulation();
  unsigned MaxCount = Known.countMaxPopulation();

  // Add range metadata since known bits can't completely reflect what we know.
  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Special case a zero mask since that's not a ConstantDataVector.
  // This masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return nullptr;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  // The pass-through vector for an x86 masked load is a zero vector.
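  // (An alignment of 1 is used below; that is a conservatively correct choice
  // for the x86 masked-load intrinsics.)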
  CallInst *NewMaskedLoad =
      IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
  return IC.replaceInstUsesWith(II, NewMaskedLoad);
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Special case a zero mask since that's not a ConstantDataVector:
  // this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return false;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);

  // 'Replace uses' doesn't work for stores. Erase the original masked store.
  IC.eraseInstFromFunction(II);
  return true;
}

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

/// Convert a table lookup to shufflevector if the mask is constant.
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
/// which case we could lower the shufflevector with rev64 instructions
/// as it's actually a byte reverse.
static Value *simplifyNeonTbl1(const IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder) {
  // Bail out if the mask is not a constant.
  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
  if (!C)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();

  // Only perform this transformation for <8 x i8> vector types.
  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
    return nullptr;

  uint32_t Indexes[8];

  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = C->getAggregateElement(I);

    if (!COp || !isa<ConstantInt>(COp))
      return nullptr;

    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();

    // Make sure the mask indices are in range.
    if (Indexes[I] >= NumElts)
      return nullptr;
  }

  auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
                                              makeArrayRef(Indexes));
  auto *V1 = II.getArgOperand(0);
  auto *V2 = Constant::getNullValue(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II,
                               unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
                       MemAlign : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Alignment);
}

// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
                             unsigned NumOperands) {
  assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
  assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
  for (unsigned i = 0; i < NumOperands; i++)
    if (I.getArgOperand(i) != E.getArgOperand(i))
      return false;
  return true;
}

// Remove trivially empty start/end intrinsic ranges, i.e. a start
// immediately followed by an end (ignoring debuginfo or other
// start/end intrinsics in between). As this handles only the most trivial
// cases, tracking the nesting level is not needed:
//
//   call @llvm.foo.start(i1 0) ; &I
//   call @llvm.foo.start(i1 0)
//   call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
//   call @llvm.foo.end(i1 0)
static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
                                      unsigned EndID, InstCombiner &IC) {
  assert(I.getIntrinsicID() == StartID &&
         "Start intrinsic does not have expected ID");
  BasicBlock::iterator BI(I), BE(I.getParent()->end());
  for (++BI; BI != BE; ++BI) {
    if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
      if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
        continue;
      if (E->getIntrinsicID() == EndID &&
          haveSameOperands(I, *E, E->getNumArgOperands())) {
        IC.eraseInstFromFunction(*E);
        IC.eraseInstFromFunction(I);
        return true;
      }
    }
    break;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
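// For example (illustrative), llvm.nvvm.ceil.d becomes llvm.ceil.f64, and
// llvm.nvvm.fmax.f becomes llvm.maxnum.f32 when the module's FTZ setting
// satisfies the requirement recorded below.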
1502 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 1503 // Each NVVM intrinsic we can simplify can be replaced with one of: 1504 // 1505 // * an LLVM intrinsic, 1506 // * an LLVM cast operation, 1507 // * an LLVM binary operation, or 1508 // * ad-hoc LLVM IR for the particular operation. 1509 1510 // Some transformations are only valid when the module's 1511 // flush-denormals-to-zero (ftz) setting is true/false, whereas other 1512 // transformations are valid regardless of the module's ftz setting. 1513 enum FtzRequirementTy { 1514 FTZ_Any, // Any ftz setting is ok. 1515 FTZ_MustBeOn, // Transformation is valid only if ftz is on. 1516 FTZ_MustBeOff, // Transformation is valid only if ftz is off. 1517 }; 1518 // Classes of NVVM intrinsics that can't be replaced one-to-one with a 1519 // target-generic intrinsic, cast op, or binary op but that we can nonetheless 1520 // simplify. 1521 enum SpecialCase { 1522 SPC_Reciprocal, 1523 }; 1524 1525 // SimplifyAction is a poor-man's variant (plus an additional flag) that 1526 // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 1527 struct SimplifyAction { 1528 // Invariant: At most one of these Optionals has a value. 1529 Optional<Intrinsic::ID> IID; 1530 Optional<Instruction::CastOps> CastOp; 1531 Optional<Instruction::BinaryOps> BinaryOp; 1532 Optional<SpecialCase> Special; 1533 1534 FtzRequirementTy FtzRequirement = FTZ_Any; 1535 1536 SimplifyAction() = default; 1537 1538 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) 1539 : IID(IID), FtzRequirement(FtzReq) {} 1540 1541 // Cast operations don't have anything to do with FTZ, so we skip that 1542 // argument. 1543 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 1544 1545 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 1546 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 1547 1548 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 1549 : Special(Special), FtzRequirement(FtzReq) {} 1550 }; 1551 1552 // Try to generate a SimplifyAction describing how to replace our 1553 // IntrinsicInstr with target-generic LLVM IR. 1554 const SimplifyAction Action = [II]() -> SimplifyAction { 1555 switch (II->getIntrinsicID()) { 1556 // NVVM intrinsics that map directly to LLVM intrinsics. 
1557 case Intrinsic::nvvm_ceil_d: 1558 return {Intrinsic::ceil, FTZ_Any}; 1559 case Intrinsic::nvvm_ceil_f: 1560 return {Intrinsic::ceil, FTZ_MustBeOff}; 1561 case Intrinsic::nvvm_ceil_ftz_f: 1562 return {Intrinsic::ceil, FTZ_MustBeOn}; 1563 case Intrinsic::nvvm_fabs_d: 1564 return {Intrinsic::fabs, FTZ_Any}; 1565 case Intrinsic::nvvm_fabs_f: 1566 return {Intrinsic::fabs, FTZ_MustBeOff}; 1567 case Intrinsic::nvvm_fabs_ftz_f: 1568 return {Intrinsic::fabs, FTZ_MustBeOn}; 1569 case Intrinsic::nvvm_floor_d: 1570 return {Intrinsic::floor, FTZ_Any}; 1571 case Intrinsic::nvvm_floor_f: 1572 return {Intrinsic::floor, FTZ_MustBeOff}; 1573 case Intrinsic::nvvm_floor_ftz_f: 1574 return {Intrinsic::floor, FTZ_MustBeOn}; 1575 case Intrinsic::nvvm_fma_rn_d: 1576 return {Intrinsic::fma, FTZ_Any}; 1577 case Intrinsic::nvvm_fma_rn_f: 1578 return {Intrinsic::fma, FTZ_MustBeOff}; 1579 case Intrinsic::nvvm_fma_rn_ftz_f: 1580 return {Intrinsic::fma, FTZ_MustBeOn}; 1581 case Intrinsic::nvvm_fmax_d: 1582 return {Intrinsic::maxnum, FTZ_Any}; 1583 case Intrinsic::nvvm_fmax_f: 1584 return {Intrinsic::maxnum, FTZ_MustBeOff}; 1585 case Intrinsic::nvvm_fmax_ftz_f: 1586 return {Intrinsic::maxnum, FTZ_MustBeOn}; 1587 case Intrinsic::nvvm_fmin_d: 1588 return {Intrinsic::minnum, FTZ_Any}; 1589 case Intrinsic::nvvm_fmin_f: 1590 return {Intrinsic::minnum, FTZ_MustBeOff}; 1591 case Intrinsic::nvvm_fmin_ftz_f: 1592 return {Intrinsic::minnum, FTZ_MustBeOn}; 1593 case Intrinsic::nvvm_round_d: 1594 return {Intrinsic::round, FTZ_Any}; 1595 case Intrinsic::nvvm_round_f: 1596 return {Intrinsic::round, FTZ_MustBeOff}; 1597 case Intrinsic::nvvm_round_ftz_f: 1598 return {Intrinsic::round, FTZ_MustBeOn}; 1599 case Intrinsic::nvvm_sqrt_rn_d: 1600 return {Intrinsic::sqrt, FTZ_Any}; 1601 case Intrinsic::nvvm_sqrt_f: 1602 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 1603 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 1604 // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are 1605 // the versions with explicit ftz-ness. 1606 return {Intrinsic::sqrt, FTZ_Any}; 1607 case Intrinsic::nvvm_sqrt_rn_f: 1608 return {Intrinsic::sqrt, FTZ_MustBeOff}; 1609 case Intrinsic::nvvm_sqrt_rn_ftz_f: 1610 return {Intrinsic::sqrt, FTZ_MustBeOn}; 1611 case Intrinsic::nvvm_trunc_d: 1612 return {Intrinsic::trunc, FTZ_Any}; 1613 case Intrinsic::nvvm_trunc_f: 1614 return {Intrinsic::trunc, FTZ_MustBeOff}; 1615 case Intrinsic::nvvm_trunc_ftz_f: 1616 return {Intrinsic::trunc, FTZ_MustBeOn}; 1617 1618 // NVVM intrinsics that map to LLVM cast operations. 1619 // 1620 // Note that llvm's target-generic conversion operators correspond to the rz 1621 // (round to zero) versions of the nvvm conversion intrinsics, even though 1622 // most everything else here uses the rn (round to nearest even) nvvm ops. 
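    // For example (illustrative): %i = call i32 @llvm.nvvm.d2i.rz(double %x)
    // becomes %i = fptosi double %x to i32. The rn/rm/rp rounding variants
    // have no direct IR equivalent and are not handled here.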
1623 case Intrinsic::nvvm_d2i_rz: 1624 case Intrinsic::nvvm_f2i_rz: 1625 case Intrinsic::nvvm_d2ll_rz: 1626 case Intrinsic::nvvm_f2ll_rz: 1627 return {Instruction::FPToSI}; 1628 case Intrinsic::nvvm_d2ui_rz: 1629 case Intrinsic::nvvm_f2ui_rz: 1630 case Intrinsic::nvvm_d2ull_rz: 1631 case Intrinsic::nvvm_f2ull_rz: 1632 return {Instruction::FPToUI}; 1633 case Intrinsic::nvvm_i2d_rz: 1634 case Intrinsic::nvvm_i2f_rz: 1635 case Intrinsic::nvvm_ll2d_rz: 1636 case Intrinsic::nvvm_ll2f_rz: 1637 return {Instruction::SIToFP}; 1638 case Intrinsic::nvvm_ui2d_rz: 1639 case Intrinsic::nvvm_ui2f_rz: 1640 case Intrinsic::nvvm_ull2d_rz: 1641 case Intrinsic::nvvm_ull2f_rz: 1642 return {Instruction::UIToFP}; 1643 1644 // NVVM intrinsics that map to LLVM binary ops. 1645 case Intrinsic::nvvm_add_rn_d: 1646 return {Instruction::FAdd, FTZ_Any}; 1647 case Intrinsic::nvvm_add_rn_f: 1648 return {Instruction::FAdd, FTZ_MustBeOff}; 1649 case Intrinsic::nvvm_add_rn_ftz_f: 1650 return {Instruction::FAdd, FTZ_MustBeOn}; 1651 case Intrinsic::nvvm_mul_rn_d: 1652 return {Instruction::FMul, FTZ_Any}; 1653 case Intrinsic::nvvm_mul_rn_f: 1654 return {Instruction::FMul, FTZ_MustBeOff}; 1655 case Intrinsic::nvvm_mul_rn_ftz_f: 1656 return {Instruction::FMul, FTZ_MustBeOn}; 1657 case Intrinsic::nvvm_div_rn_d: 1658 return {Instruction::FDiv, FTZ_Any}; 1659 case Intrinsic::nvvm_div_rn_f: 1660 return {Instruction::FDiv, FTZ_MustBeOff}; 1661 case Intrinsic::nvvm_div_rn_ftz_f: 1662 return {Instruction::FDiv, FTZ_MustBeOn}; 1663 1664 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 1665 // need special handling. 1666 // 1667 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 1668 // as well. 1669 case Intrinsic::nvvm_rcp_rn_d: 1670 return {SPC_Reciprocal, FTZ_Any}; 1671 case Intrinsic::nvvm_rcp_rn_f: 1672 return {SPC_Reciprocal, FTZ_MustBeOff}; 1673 case Intrinsic::nvvm_rcp_rn_ftz_f: 1674 return {SPC_Reciprocal, FTZ_MustBeOn}; 1675 1676 // We do not currently simplify intrinsics that give an approximate answer. 1677 // These include: 1678 // 1679 // - nvvm_cos_approx_{f,ftz_f} 1680 // - nvvm_ex2_approx_{d,f,ftz_f} 1681 // - nvvm_lg2_approx_{d,f,ftz_f} 1682 // - nvvm_sin_approx_{f,ftz_f} 1683 // - nvvm_sqrt_approx_{f,ftz_f} 1684 // - nvvm_rsqrt_approx_{d,f,ftz_f} 1685 // - nvvm_div_approx_{ftz_d,ftz_f,f} 1686 // - nvvm_rcp_approx_ftz_d 1687 // 1688 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 1689 // means that fastmath is enabled in the intrinsic. Unfortunately only 1690 // binary operators (currently) have a fastmath bit in SelectionDAG, so this 1691 // information gets lost and we can't select on it. 1692 // 1693 // TODO: div and rcp are lowered to a binary op, so these we could in theory 1694 // lower them to "fast fdiv". 1695 1696 default: 1697 return {}; 1698 } 1699 }(); 1700 1701 // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we 1702 // can bail out now. (Notice that in the case that IID is not an NVVM 1703 // intrinsic, we don't have to look up any module metadata, as 1704 // FtzRequirementTy will be FTZ_Any.) 1705 if (Action.FtzRequirement != FTZ_Any) { 1706 bool FtzEnabled = 1707 II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == 1708 "true"; 1709 1710 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 1711 return nullptr; 1712 } 1713 1714 // Simplify to target-generic intrinsic. 
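  // For instance (illustrative), @llvm.nvvm.fmax.d(double %a, double %b) is
  // rebuilt as @llvm.maxnum.f64(double %a, double %b); the one overloaded
  // type is taken from the first argument below.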
1715 if (Action.IID) { 1716 SmallVector<Value *, 4> Args(II->arg_operands()); 1717 // All the target-generic intrinsics currently of interest to us have one 1718 // type argument, equal to that of the nvvm intrinsic's argument. 1719 Type *Tys[] = {II->getArgOperand(0)->getType()}; 1720 return CallInst::Create( 1721 Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 1722 } 1723 1724 // Simplify to target-generic binary op. 1725 if (Action.BinaryOp) 1726 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 1727 II->getArgOperand(1), II->getName()); 1728 1729 // Simplify to target-generic cast op. 1730 if (Action.CastOp) 1731 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 1732 II->getName()); 1733 1734 // All that's left are the special cases. 1735 if (!Action.Special) 1736 return nullptr; 1737 1738 switch (*Action.Special) { 1739 case SPC_Reciprocal: 1740 // Simplify reciprocal. 1741 return BinaryOperator::Create( 1742 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 1743 II->getArgOperand(0), II->getName()); 1744 } 1745 llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 1746 } 1747 1748 Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) { 1749 removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this); 1750 return nullptr; 1751 } 1752 1753 Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { 1754 removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this); 1755 return nullptr; 1756 } 1757 1758 static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { 1759 assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); 1760 Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); 1761 if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) { 1762 Call.setArgOperand(0, Arg1); 1763 Call.setArgOperand(1, Arg0); 1764 return &Call; 1765 } 1766 return nullptr; 1767 } 1768 1769 Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { 1770 WithOverflowInst *WO = cast<WithOverflowInst>(II); 1771 Value *OperationResult = nullptr; 1772 Constant *OverflowResult = nullptr; 1773 if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), 1774 WO->getRHS(), *WO, OperationResult, OverflowResult)) 1775 return CreateOverflowTuple(WO, OperationResult, OverflowResult); 1776 return nullptr; 1777 } 1778 1779 /// CallInst simplification. This mostly only handles folding of intrinsic 1780 /// instructions. For normal calls, it allows visitCallBase to do the heavy 1781 /// lifting. 1782 Instruction *InstCombiner::visitCallInst(CallInst &CI) { 1783 if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI))) 1784 return replaceInstUsesWith(CI, V); 1785 1786 if (isFreeCall(&CI, &TLI)) 1787 return visitFree(CI); 1788 1789 // If the caller function is nounwind, mark the call as nounwind, even if the 1790 // callee isn't. 1791 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1792 CI.setDoesNotThrow(); 1793 return &CI; 1794 } 1795 1796 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1797 if (!II) return visitCallBase(CI); 1798 1799 // Intrinsics cannot occur in an invoke or a callbr, so handle them here 1800 // instead of in visitCallBase. 1801 if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) { 1802 bool Changed = false; 1803 1804 // memmove/cpy/set of zero bytes is a noop. 
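    // For example (illustrative):
    //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 0, i1 false)
    // can simply be erased.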
1805 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1806 if (NumBytes->isNullValue()) 1807 return eraseInstFromFunction(CI); 1808 1809 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1810 if (CI->getZExtValue() == 1) { 1811 // Replace the instruction with just byte operations. We would 1812 // transform other cases to loads/stores, but we don't know if 1813 // alignment is sufficient. 1814 } 1815 } 1816 1817 // No other transformations apply to volatile transfers. 1818 if (auto *M = dyn_cast<MemIntrinsic>(MI)) 1819 if (M->isVolatile()) 1820 return nullptr; 1821 1822 // If we have a memmove and the source operation is a constant global, 1823 // then the source and dest pointers can't alias, so we can change this 1824 // into a call to memcpy. 1825 if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) { 1826 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1827 if (GVSrc->isConstant()) { 1828 Module *M = CI.getModule(); 1829 Intrinsic::ID MemCpyID = 1830 isa<AtomicMemMoveInst>(MMI) 1831 ? Intrinsic::memcpy_element_unordered_atomic 1832 : Intrinsic::memcpy; 1833 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1834 CI.getArgOperand(1)->getType(), 1835 CI.getArgOperand(2)->getType() }; 1836 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1837 Changed = true; 1838 } 1839 } 1840 1841 if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1842 // memmove(x,x,size) -> noop. 1843 if (MTI->getSource() == MTI->getDest()) 1844 return eraseInstFromFunction(CI); 1845 } 1846 1847 // If we can determine a pointer alignment that is bigger than currently 1848 // set, update the alignment. 1849 if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1850 if (Instruction *I = SimplifyAnyMemTransfer(MTI)) 1851 return I; 1852 } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) { 1853 if (Instruction *I = SimplifyAnyMemSet(MSI)) 1854 return I; 1855 } 1856 1857 if (Changed) return II; 1858 } 1859 1860 // For vector result intrinsics, use the generic demanded vector support. 
1861 if (II->getType()->isVectorTy()) { 1862 auto VWidth = II->getType()->getVectorNumElements(); 1863 APInt UndefElts(VWidth, 0); 1864 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 1865 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 1866 if (V != II) 1867 return replaceInstUsesWith(*II, V); 1868 return II; 1869 } 1870 } 1871 1872 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1873 return I; 1874 1875 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1876 unsigned DemandedWidth) { 1877 APInt UndefElts(Width, 0); 1878 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1879 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1880 }; 1881 1882 Intrinsic::ID IID = II->getIntrinsicID(); 1883 switch (IID) { 1884 default: break; 1885 case Intrinsic::objectsize: 1886 if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1887 return replaceInstUsesWith(CI, V); 1888 return nullptr; 1889 case Intrinsic::bswap: { 1890 Value *IIOperand = II->getArgOperand(0); 1891 Value *X = nullptr; 1892 1893 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1894 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1895 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1896 IIOperand->getType()->getPrimitiveSizeInBits(); 1897 Value *CV = ConstantInt::get(X->getType(), C); 1898 Value *V = Builder.CreateLShr(X, CV); 1899 return new TruncInst(V, IIOperand->getType()); 1900 } 1901 break; 1902 } 1903 case Intrinsic::masked_load: 1904 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II)) 1905 return replaceInstUsesWith(CI, SimplifiedMaskedOp); 1906 break; 1907 case Intrinsic::masked_store: 1908 return simplifyMaskedStore(*II); 1909 case Intrinsic::masked_gather: 1910 return simplifyMaskedGather(*II); 1911 case Intrinsic::masked_scatter: 1912 return simplifyMaskedScatter(*II); 1913 case Intrinsic::launder_invariant_group: 1914 case Intrinsic::strip_invariant_group: 1915 if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this)) 1916 return replaceInstUsesWith(*II, SkippedBarrier); 1917 break; 1918 case Intrinsic::powi: 1919 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 1920 // 0 and 1 are handled in instsimplify 1921 1922 // powi(x, -1) -> 1/x 1923 if (Power->isMinusOne()) 1924 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 1925 II->getArgOperand(0)); 1926 // powi(x, 2) -> x*x 1927 if (Power->equalsInt(2)) 1928 return BinaryOperator::CreateFMul(II->getArgOperand(0), 1929 II->getArgOperand(0)); 1930 } 1931 break; 1932 1933 case Intrinsic::cttz: 1934 case Intrinsic::ctlz: 1935 if (auto *I = foldCttzCtlz(*II, *this)) 1936 return I; 1937 break; 1938 1939 case Intrinsic::ctpop: 1940 if (auto *I = foldCtpop(*II, *this)) 1941 return I; 1942 break; 1943 1944 case Intrinsic::fshl: 1945 case Intrinsic::fshr: { 1946 Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); 1947 Type *Ty = II->getType(); 1948 unsigned BitWidth = Ty->getScalarSizeInBits(); 1949 Constant *ShAmtC; 1950 if (match(II->getArgOperand(2), m_Constant(ShAmtC)) && 1951 !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) { 1952 // Canonicalize a shift amount constant operand to modulo the bit-width. 
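      // For example (illustrative): fshl i32 X, Y, 35 --> fshl i32 X, Y, 3.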
1953 Constant *WidthC = ConstantInt::get(Ty, BitWidth); 1954 Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); 1955 if (ModuloC != ShAmtC) { 1956 II->setArgOperand(2, ModuloC); 1957 return II; 1958 } 1959 assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) == 1960 ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) && 1961 "Shift amount expected to be modulo bitwidth"); 1962 1963 // Canonicalize funnel shift right by constant to funnel shift left. This 1964 // is not entirely arbitrary. For historical reasons, the backend may 1965 // recognize rotate left patterns but miss rotate right patterns. 1966 if (IID == Intrinsic::fshr) { 1967 // fshr X, Y, C --> fshl X, Y, (BitWidth - C) 1968 Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); 1969 Module *Mod = II->getModule(); 1970 Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); 1971 return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC }); 1972 } 1973 assert(IID == Intrinsic::fshl && 1974 "All funnel shifts by simple constants should go left"); 1975 1976 // fshl(X, 0, C) --> shl X, C 1977 // fshl(X, undef, C) --> shl X, C 1978 if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef())) 1979 return BinaryOperator::CreateShl(Op0, ShAmtC); 1980 1981 // fshl(0, X, C) --> lshr X, (BW-C) 1982 // fshl(undef, X, C) --> lshr X, (BW-C) 1983 if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef())) 1984 return BinaryOperator::CreateLShr(Op1, 1985 ConstantExpr::getSub(WidthC, ShAmtC)); 1986 1987 // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) 1988 if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { 1989 Module *Mod = II->getModule(); 1990 Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); 1991 return CallInst::Create(Bswap, { Op0 }); 1992 } 1993 } 1994 1995 // Left or right might be masked. 1996 if (SimplifyDemandedInstructionBits(*II)) 1997 return &CI; 1998 1999 // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, 2000 // so only the low bits of the shift amount are demanded if the bitwidth is 2001 // a power-of-2. 2002 if (!isPowerOf2_32(BitWidth)) 2003 break; 2004 APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); 2005 KnownBits Op2Known(BitWidth); 2006 if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) 2007 return &CI; 2008 break; 2009 } 2010 case Intrinsic::uadd_with_overflow: 2011 case Intrinsic::sadd_with_overflow: { 2012 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2013 return I; 2014 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2015 return I; 2016 2017 // Given 2 constant operands whose sum does not overflow: 2018 // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1 2019 // saddo (X +nsw C0), C1 -> saddo X, C0 + C1 2020 Value *X; 2021 const APInt *C0, *C1; 2022 Value *Arg0 = II->getArgOperand(0); 2023 Value *Arg1 = II->getArgOperand(1); 2024 bool IsSigned = IID == Intrinsic::sadd_with_overflow; 2025 bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0))) 2026 : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0))); 2027 if (HasNWAdd && match(Arg1, m_APInt(C1))) { 2028 bool Overflow; 2029 APInt NewC = 2030 IsSigned ? 
C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow); 2031 if (!Overflow) 2032 return replaceInstUsesWith( 2033 *II, Builder.CreateBinaryIntrinsic( 2034 IID, X, ConstantInt::get(Arg1->getType(), NewC))); 2035 } 2036 break; 2037 } 2038 2039 case Intrinsic::umul_with_overflow: 2040 case Intrinsic::smul_with_overflow: 2041 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2042 return I; 2043 LLVM_FALLTHROUGH; 2044 2045 case Intrinsic::usub_with_overflow: 2046 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2047 return I; 2048 break; 2049 2050 case Intrinsic::ssub_with_overflow: { 2051 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2052 return I; 2053 2054 Constant *C; 2055 Value *Arg0 = II->getArgOperand(0); 2056 Value *Arg1 = II->getArgOperand(1); 2057 // Given a constant C that is not the minimum signed value 2058 // for an integer of a given bit width: 2059 // 2060 // ssubo X, C -> saddo X, -C 2061 if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) { 2062 Value *NegVal = ConstantExpr::getNeg(C); 2063 // Build a saddo call that is equivalent to the discovered 2064 // ssubo call. 2065 return replaceInstUsesWith( 2066 *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, 2067 Arg0, NegVal)); 2068 } 2069 2070 break; 2071 } 2072 2073 case Intrinsic::uadd_sat: 2074 case Intrinsic::sadd_sat: 2075 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2076 return I; 2077 LLVM_FALLTHROUGH; 2078 case Intrinsic::usub_sat: 2079 case Intrinsic::ssub_sat: { 2080 SaturatingInst *SI = cast<SaturatingInst>(II); 2081 Type *Ty = SI->getType(); 2082 Value *Arg0 = SI->getLHS(); 2083 Value *Arg1 = SI->getRHS(); 2084 2085 // Make use of known overflow information. 2086 OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(), 2087 Arg0, Arg1, SI); 2088 switch (OR) { 2089 case OverflowResult::MayOverflow: 2090 break; 2091 case OverflowResult::NeverOverflows: 2092 if (SI->isSigned()) 2093 return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1); 2094 else 2095 return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1); 2096 case OverflowResult::AlwaysOverflowsLow: { 2097 unsigned BitWidth = Ty->getScalarSizeInBits(); 2098 APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned()); 2099 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min)); 2100 } 2101 case OverflowResult::AlwaysOverflowsHigh: { 2102 unsigned BitWidth = Ty->getScalarSizeInBits(); 2103 APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned()); 2104 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max)); 2105 } 2106 } 2107 2108 // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN 2109 Constant *C; 2110 if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && 2111 C->isNotMinSignedValue()) { 2112 Value *NegVal = ConstantExpr::getNeg(C); 2113 return replaceInstUsesWith( 2114 *II, Builder.CreateBinaryIntrinsic( 2115 Intrinsic::sadd_sat, Arg0, NegVal)); 2116 } 2117 2118 // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) 2119 // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) 2120 // if Val and Val2 have the same sign 2121 if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { 2122 Value *X; 2123 const APInt *Val, *Val2; 2124 APInt NewVal; 2125 bool IsUnsigned = 2126 IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; 2127 if (Other->getIntrinsicID() == IID && 2128 match(Arg1, m_APInt(Val)) && 2129 match(Other->getArgOperand(0), m_Value(X)) && 2130 match(Other->getArgOperand(1), m_APInt(Val2))) { 2131 if (IsUnsigned) 2132 NewVal = Val->uadd_sat(*Val2); 2133 else 
if (Val->isNonNegative() == Val2->isNonNegative()) { 2134 bool Overflow; 2135 NewVal = Val->sadd_ov(*Val2, Overflow); 2136 if (Overflow) { 2137 // Both adds together may add more than SignedMaxValue 2138 // without saturating the final result. 2139 break; 2140 } 2141 } else { 2142 // Cannot fold saturated addition with different signs. 2143 break; 2144 } 2145 2146 return replaceInstUsesWith( 2147 *II, Builder.CreateBinaryIntrinsic( 2148 IID, X, ConstantInt::get(II->getType(), NewVal))); 2149 } 2150 } 2151 break; 2152 } 2153 2154 case Intrinsic::minnum: 2155 case Intrinsic::maxnum: 2156 case Intrinsic::minimum: 2157 case Intrinsic::maximum: { 2158 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2159 return I; 2160 Value *Arg0 = II->getArgOperand(0); 2161 Value *Arg1 = II->getArgOperand(1); 2162 Value *X, *Y; 2163 if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && 2164 (Arg0->hasOneUse() || Arg1->hasOneUse())) { 2165 // If both operands are negated, invert the call and negate the result: 2166 // min(-X, -Y) --> -(max(X, Y)) 2167 // max(-X, -Y) --> -(min(X, Y)) 2168 Intrinsic::ID NewIID; 2169 switch (IID) { 2170 case Intrinsic::maxnum: 2171 NewIID = Intrinsic::minnum; 2172 break; 2173 case Intrinsic::minnum: 2174 NewIID = Intrinsic::maxnum; 2175 break; 2176 case Intrinsic::maximum: 2177 NewIID = Intrinsic::minimum; 2178 break; 2179 case Intrinsic::minimum: 2180 NewIID = Intrinsic::maximum; 2181 break; 2182 default: 2183 llvm_unreachable("unexpected intrinsic ID"); 2184 } 2185 Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); 2186 Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall); 2187 FNeg->copyIRFlags(II); 2188 return FNeg; 2189 } 2190 2191 // m(m(X, C2), C1) -> m(X, C) 2192 const APFloat *C1, *C2; 2193 if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { 2194 if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && 2195 ((match(M->getArgOperand(0), m_Value(X)) && 2196 match(M->getArgOperand(1), m_APFloat(C2))) || 2197 (match(M->getArgOperand(1), m_Value(X)) && 2198 match(M->getArgOperand(0), m_APFloat(C2))))) { 2199 APFloat Res(0.0); 2200 switch (IID) { 2201 case Intrinsic::maxnum: 2202 Res = maxnum(*C1, *C2); 2203 break; 2204 case Intrinsic::minnum: 2205 Res = minnum(*C1, *C2); 2206 break; 2207 case Intrinsic::maximum: 2208 Res = maximum(*C1, *C2); 2209 break; 2210 case Intrinsic::minimum: 2211 Res = minimum(*C1, *C2); 2212 break; 2213 default: 2214 llvm_unreachable("unexpected intrinsic ID"); 2215 } 2216 Instruction *NewCall = Builder.CreateBinaryIntrinsic( 2217 IID, X, ConstantFP::get(Arg0->getType(), Res)); 2218 NewCall->copyIRFlags(II); 2219 return replaceInstUsesWith(*II, NewCall); 2220 } 2221 } 2222 2223 break; 2224 } 2225 case Intrinsic::fmuladd: { 2226 // Canonicalize fast fmuladd to the separate fmul + fadd. 2227 if (II->isFast()) { 2228 BuilderTy::FastMathFlagGuard Guard(Builder); 2229 Builder.setFastMathFlags(II->getFastMathFlags()); 2230 Value *Mul = Builder.CreateFMul(II->getArgOperand(0), 2231 II->getArgOperand(1)); 2232 Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); 2233 Add->takeName(II); 2234 return replaceInstUsesWith(*II, Add); 2235 } 2236 2237 // Try to simplify the underlying FMul. 
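    // (Illustrative: if the product folds away, e.g. fmuladd X, 1.0, Z, only
    // the final fadd X, Z remains.)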
2238 if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), 2239 II->getFastMathFlags(), 2240 SQ.getWithInstruction(II))) { 2241 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2242 FAdd->copyFastMathFlags(II); 2243 return FAdd; 2244 } 2245 2246 LLVM_FALLTHROUGH; 2247 } 2248 case Intrinsic::fma: { 2249 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2250 return I; 2251 2252 // fma fneg(x), fneg(y), z -> fma x, y, z 2253 Value *Src0 = II->getArgOperand(0); 2254 Value *Src1 = II->getArgOperand(1); 2255 Value *X, *Y; 2256 if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { 2257 II->setArgOperand(0, X); 2258 II->setArgOperand(1, Y); 2259 return II; 2260 } 2261 2262 // fma fabs(x), fabs(x), z -> fma x, x, z 2263 if (match(Src0, m_FAbs(m_Value(X))) && 2264 match(Src1, m_FAbs(m_Specific(X)))) { 2265 II->setArgOperand(0, X); 2266 II->setArgOperand(1, X); 2267 return II; 2268 } 2269 2270 // Try to simplify the underlying FMul. We can only apply simplifications 2271 // that do not require rounding. 2272 if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), 2273 II->getFastMathFlags(), 2274 SQ.getWithInstruction(II))) { 2275 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2276 FAdd->copyFastMathFlags(II); 2277 return FAdd; 2278 } 2279 2280 break; 2281 } 2282 case Intrinsic::fabs: { 2283 Value *Cond; 2284 Constant *LHS, *RHS; 2285 if (match(II->getArgOperand(0), 2286 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2287 CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); 2288 CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); 2289 return SelectInst::Create(Cond, Call0, Call1); 2290 } 2291 2292 LLVM_FALLTHROUGH; 2293 } 2294 case Intrinsic::ceil: 2295 case Intrinsic::floor: 2296 case Intrinsic::round: 2297 case Intrinsic::nearbyint: 2298 case Intrinsic::rint: 2299 case Intrinsic::trunc: { 2300 Value *ExtSrc; 2301 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { 2302 // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) 2303 Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II); 2304 return new FPExtInst(NarrowII, II->getType()); 2305 } 2306 break; 2307 } 2308 case Intrinsic::cos: 2309 case Intrinsic::amdgcn_cos: { 2310 Value *X; 2311 Value *Src = II->getArgOperand(0); 2312 if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { 2313 // cos(-x) -> cos(x) 2314 // cos(fabs(x)) -> cos(x) 2315 II->setArgOperand(0, X); 2316 return II; 2317 } 2318 break; 2319 } 2320 case Intrinsic::sin: { 2321 Value *X; 2322 if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { 2323 // sin(-x) --> -sin(x) 2324 Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); 2325 Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin); 2326 FNeg->copyFastMathFlags(II); 2327 return FNeg; 2328 } 2329 break; 2330 } 2331 case Intrinsic::ppc_altivec_lvx: 2332 case Intrinsic::ppc_altivec_lvxl: 2333 // Turn PPC lvx -> load if the pointer is known aligned. 2334 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2335 &DT) >= 16) { 2336 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2337 PointerType::getUnqual(II->getType())); 2338 return new LoadInst(II->getType(), Ptr); 2339 } 2340 break; 2341 case Intrinsic::ppc_vsx_lxvw4x: 2342 case Intrinsic::ppc_vsx_lxvd2x: { 2343 // Turn PPC VSX loads into normal loads. 
2344 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2345 PointerType::getUnqual(II->getType())); 2346 return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None()); 2347 } 2348 case Intrinsic::ppc_altivec_stvx: 2349 case Intrinsic::ppc_altivec_stvxl: 2350 // Turn stvx -> store if the pointer is known aligned. 2351 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2352 &DT) >= 16) { 2353 Type *OpPtrTy = 2354 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2355 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2356 return new StoreInst(II->getArgOperand(0), Ptr); 2357 } 2358 break; 2359 case Intrinsic::ppc_vsx_stxvw4x: 2360 case Intrinsic::ppc_vsx_stxvd2x: { 2361 // Turn PPC VSX stores into normal stores. 2362 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2363 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2364 return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None()); 2365 } 2366 case Intrinsic::ppc_qpx_qvlfs: 2367 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 2368 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2369 &DT) >= 16) { 2370 Type *VTy = VectorType::get(Builder.getFloatTy(), 2371 II->getType()->getVectorNumElements()); 2372 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2373 PointerType::getUnqual(VTy)); 2374 Value *Load = Builder.CreateLoad(VTy, Ptr); 2375 return new FPExtInst(Load, II->getType()); 2376 } 2377 break; 2378 case Intrinsic::ppc_qpx_qvlfd: 2379 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2380 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, 2381 &DT) >= 32) { 2382 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2383 PointerType::getUnqual(II->getType())); 2384 return new LoadInst(II->getType(), Ptr); 2385 } 2386 break; 2387 case Intrinsic::ppc_qpx_qvstfs: 2388 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 2389 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2390 &DT) >= 16) { 2391 Type *VTy = VectorType::get(Builder.getFloatTy(), 2392 II->getArgOperand(0)->getType()->getVectorNumElements()); 2393 Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); 2394 Type *OpPtrTy = PointerType::getUnqual(VTy); 2395 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2396 return new StoreInst(TOp, Ptr); 2397 } 2398 break; 2399 case Intrinsic::ppc_qpx_qvstfd: 2400 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 2401 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, 2402 &DT) >= 32) { 2403 Type *OpPtrTy = 2404 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2405 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2406 return new StoreInst(II->getArgOperand(0), Ptr); 2407 } 2408 break; 2409 2410 case Intrinsic::x86_bmi_bextr_32: 2411 case Intrinsic::x86_bmi_bextr_64: 2412 case Intrinsic::x86_tbm_bextri_u32: 2413 case Intrinsic::x86_tbm_bextri_u64: 2414 // If the RHS is a constant we can try some simplifications. 2415 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2416 uint64_t Shift = C->getZExtValue(); 2417 uint64_t Length = (Shift >> 8) & 0xff; 2418 Shift &= 0xff; 2419 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2420 // If the length is 0 or the shift is out of range, replace with zero. 
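      // Worked example for the constant fold below (illustrative): with
      // control 0x0804 (shift 4, length 8) and source 0x12345678, the result
      // is (0x12345678 >> 4) & 0xff = 0x67.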
2421 if (Length == 0 || Shift >= BitWidth) 2422 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2423 // If the LHS is also a constant, we can completely constant fold this. 2424 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2425 uint64_t Result = InC->getZExtValue() >> Shift; 2426 if (Length > BitWidth) 2427 Length = BitWidth; 2428 Result &= maskTrailingOnes<uint64_t>(Length); 2429 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2430 } 2431 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2432 // are only masking bits that a shift already cleared? 2433 } 2434 break; 2435 2436 case Intrinsic::x86_bmi_bzhi_32: 2437 case Intrinsic::x86_bmi_bzhi_64: 2438 // If the RHS is a constant we can try some simplifications. 2439 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2440 uint64_t Index = C->getZExtValue() & 0xff; 2441 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2442 if (Index >= BitWidth) 2443 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2444 if (Index == 0) 2445 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2446 // If the LHS is also a constant, we can completely constant fold this. 2447 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2448 uint64_t Result = InC->getZExtValue(); 2449 Result &= maskTrailingOnes<uint64_t>(Index); 2450 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2451 } 2452 // TODO should we convert this to an AND if the RHS is constant? 2453 } 2454 break; 2455 2456 case Intrinsic::x86_vcvtph2ps_128: 2457 case Intrinsic::x86_vcvtph2ps_256: { 2458 auto Arg = II->getArgOperand(0); 2459 auto ArgType = cast<VectorType>(Arg->getType()); 2460 auto RetType = cast<VectorType>(II->getType()); 2461 unsigned ArgWidth = ArgType->getNumElements(); 2462 unsigned RetWidth = RetType->getNumElements(); 2463 assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); 2464 assert(ArgType->isIntOrIntVectorTy() && 2465 ArgType->getScalarSizeInBits() == 16 && 2466 "CVTPH2PS input type should be 16-bit integer vector"); 2467 assert(RetType->getScalarType()->isFloatTy() && 2468 "CVTPH2PS output type should be 32-bit float vector"); 2469 2470 // Constant folding: Convert to generic half to single conversion. 2471 if (isa<ConstantAggregateZero>(Arg)) 2472 return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); 2473 2474 if (isa<ConstantDataVector>(Arg)) { 2475 auto VectorHalfAsShorts = Arg; 2476 if (RetWidth < ArgWidth) { 2477 SmallVector<uint32_t, 8> SubVecMask; 2478 for (unsigned i = 0; i != RetWidth; ++i) 2479 SubVecMask.push_back((int)i); 2480 VectorHalfAsShorts = Builder.CreateShuffleVector( 2481 Arg, UndefValue::get(ArgType), SubVecMask); 2482 } 2483 2484 auto VectorHalfType = 2485 VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); 2486 auto VectorHalfs = 2487 Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType); 2488 auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType); 2489 return replaceInstUsesWith(*II, VectorFloats); 2490 } 2491 2492 // We only use the lowest lanes of the argument. 
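    // (Illustrative: the 128-bit form converts only 4 of the 8 i16 input
    // lanes, so the upper 4 input lanes can be treated as undef.)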
2493 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { 2494 II->setArgOperand(0, V); 2495 return II; 2496 } 2497 break; 2498 } 2499 2500 case Intrinsic::x86_sse_cvtss2si: 2501 case Intrinsic::x86_sse_cvtss2si64: 2502 case Intrinsic::x86_sse_cvttss2si: 2503 case Intrinsic::x86_sse_cvttss2si64: 2504 case Intrinsic::x86_sse2_cvtsd2si: 2505 case Intrinsic::x86_sse2_cvtsd2si64: 2506 case Intrinsic::x86_sse2_cvttsd2si: 2507 case Intrinsic::x86_sse2_cvttsd2si64: 2508 case Intrinsic::x86_avx512_vcvtss2si32: 2509 case Intrinsic::x86_avx512_vcvtss2si64: 2510 case Intrinsic::x86_avx512_vcvtss2usi32: 2511 case Intrinsic::x86_avx512_vcvtss2usi64: 2512 case Intrinsic::x86_avx512_vcvtsd2si32: 2513 case Intrinsic::x86_avx512_vcvtsd2si64: 2514 case Intrinsic::x86_avx512_vcvtsd2usi32: 2515 case Intrinsic::x86_avx512_vcvtsd2usi64: 2516 case Intrinsic::x86_avx512_cvttss2si: 2517 case Intrinsic::x86_avx512_cvttss2si64: 2518 case Intrinsic::x86_avx512_cvttss2usi: 2519 case Intrinsic::x86_avx512_cvttss2usi64: 2520 case Intrinsic::x86_avx512_cvttsd2si: 2521 case Intrinsic::x86_avx512_cvttsd2si64: 2522 case Intrinsic::x86_avx512_cvttsd2usi: 2523 case Intrinsic::x86_avx512_cvttsd2usi64: { 2524 // These intrinsics only demand the 0th element of their input vectors. If 2525 // we can simplify the input based on that, do so now. 2526 Value *Arg = II->getArgOperand(0); 2527 unsigned VWidth = Arg->getType()->getVectorNumElements(); 2528 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2529 II->setArgOperand(0, V); 2530 return II; 2531 } 2532 break; 2533 } 2534 2535 case Intrinsic::x86_mmx_pmovmskb: 2536 case Intrinsic::x86_sse_movmsk_ps: 2537 case Intrinsic::x86_sse2_movmsk_pd: 2538 case Intrinsic::x86_sse2_pmovmskb_128: 2539 case Intrinsic::x86_avx_movmsk_pd_256: 2540 case Intrinsic::x86_avx_movmsk_ps_256: 2541 case Intrinsic::x86_avx2_pmovmskb: 2542 if (Value *V = simplifyX86movmsk(*II, Builder)) 2543 return replaceInstUsesWith(*II, V); 2544 break; 2545 2546 case Intrinsic::x86_sse_comieq_ss: 2547 case Intrinsic::x86_sse_comige_ss: 2548 case Intrinsic::x86_sse_comigt_ss: 2549 case Intrinsic::x86_sse_comile_ss: 2550 case Intrinsic::x86_sse_comilt_ss: 2551 case Intrinsic::x86_sse_comineq_ss: 2552 case Intrinsic::x86_sse_ucomieq_ss: 2553 case Intrinsic::x86_sse_ucomige_ss: 2554 case Intrinsic::x86_sse_ucomigt_ss: 2555 case Intrinsic::x86_sse_ucomile_ss: 2556 case Intrinsic::x86_sse_ucomilt_ss: 2557 case Intrinsic::x86_sse_ucomineq_ss: 2558 case Intrinsic::x86_sse2_comieq_sd: 2559 case Intrinsic::x86_sse2_comige_sd: 2560 case Intrinsic::x86_sse2_comigt_sd: 2561 case Intrinsic::x86_sse2_comile_sd: 2562 case Intrinsic::x86_sse2_comilt_sd: 2563 case Intrinsic::x86_sse2_comineq_sd: 2564 case Intrinsic::x86_sse2_ucomieq_sd: 2565 case Intrinsic::x86_sse2_ucomige_sd: 2566 case Intrinsic::x86_sse2_ucomigt_sd: 2567 case Intrinsic::x86_sse2_ucomile_sd: 2568 case Intrinsic::x86_sse2_ucomilt_sd: 2569 case Intrinsic::x86_sse2_ucomineq_sd: 2570 case Intrinsic::x86_avx512_vcomi_ss: 2571 case Intrinsic::x86_avx512_vcomi_sd: 2572 case Intrinsic::x86_avx512_mask_cmp_ss: 2573 case Intrinsic::x86_avx512_mask_cmp_sd: { 2574 // These intrinsics only demand the 0th element of their input vectors. If 2575 // we can simplify the input based on that, do so now. 
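    // (Illustrative: comieq.ss compares only lane 0 of each operand, so an
    // insertelement into lanes 1-3 of either operand is dead here.)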
2576 bool MadeChange = false;
2577 Value *Arg0 = II->getArgOperand(0);
2578 Value *Arg1 = II->getArgOperand(1);
2579 unsigned VWidth = Arg0->getType()->getVectorNumElements();
2580 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2581 II->setArgOperand(0, V);
2582 MadeChange = true;
2583 }
2584 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2585 II->setArgOperand(1, V);
2586 MadeChange = true;
2587 }
2588 if (MadeChange)
2589 return II;
2590 break;
2591 }
2592 case Intrinsic::x86_avx512_cmp_pd_128:
2593 case Intrinsic::x86_avx512_cmp_pd_256:
2594 case Intrinsic::x86_avx512_cmp_pd_512:
2595 case Intrinsic::x86_avx512_cmp_ps_128:
2596 case Intrinsic::x86_avx512_cmp_ps_256:
2597 case Intrinsic::x86_avx512_cmp_ps_512: {
2598 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
2599 Value *Arg0 = II->getArgOperand(0);
2600 Value *Arg1 = II->getArgOperand(1);
2601 bool Arg0IsZero = match(Arg0, m_PosZeroFP());
2602 if (Arg0IsZero)
2603 std::swap(Arg0, Arg1);
2604 Value *A, *B;
2605 // This fold requires only NINF (no +/- inf), since inf minus
2606 // inf is nan.
2607 // NSZ (No Signed Zeros) is not needed because zeros of any sign are
2608 // equal for both compares.
2609 // NNAN is not needed because nans compare the same for both compares.
2610 // The compare intrinsic uses the above assumptions and therefore
2611 // doesn't require additional flags.
2612 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
2613 match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) &&
2614 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
2615 if (Arg0IsZero)
2616 std::swap(A, B);
2617 II->setArgOperand(0, A);
2618 II->setArgOperand(1, B);
2619 return II;
2620 }
2621 break;
2622 }
2623
2624 case Intrinsic::x86_avx512_add_ps_512:
2625 case Intrinsic::x86_avx512_div_ps_512:
2626 case Intrinsic::x86_avx512_mul_ps_512:
2627 case Intrinsic::x86_avx512_sub_ps_512:
2628 case Intrinsic::x86_avx512_add_pd_512:
2629 case Intrinsic::x86_avx512_div_pd_512:
2630 case Intrinsic::x86_avx512_mul_pd_512:
2631 case Intrinsic::x86_avx512_sub_pd_512:
2632 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2633 // IR operations.
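    // For example (illustrative):
    //   @llvm.x86.avx512.add.ps.512(%a, %b, i32 4) --> fadd <16 x float> %a, %b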
2634 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2635 if (R->getValue() == 4) { 2636 Value *Arg0 = II->getArgOperand(0); 2637 Value *Arg1 = II->getArgOperand(1); 2638 2639 Value *V; 2640 switch (IID) { 2641 default: llvm_unreachable("Case stmts out of sync!"); 2642 case Intrinsic::x86_avx512_add_ps_512: 2643 case Intrinsic::x86_avx512_add_pd_512: 2644 V = Builder.CreateFAdd(Arg0, Arg1); 2645 break; 2646 case Intrinsic::x86_avx512_sub_ps_512: 2647 case Intrinsic::x86_avx512_sub_pd_512: 2648 V = Builder.CreateFSub(Arg0, Arg1); 2649 break; 2650 case Intrinsic::x86_avx512_mul_ps_512: 2651 case Intrinsic::x86_avx512_mul_pd_512: 2652 V = Builder.CreateFMul(Arg0, Arg1); 2653 break; 2654 case Intrinsic::x86_avx512_div_ps_512: 2655 case Intrinsic::x86_avx512_div_pd_512: 2656 V = Builder.CreateFDiv(Arg0, Arg1); 2657 break; 2658 } 2659 2660 return replaceInstUsesWith(*II, V); 2661 } 2662 } 2663 break; 2664 2665 case Intrinsic::x86_avx512_mask_add_ss_round: 2666 case Intrinsic::x86_avx512_mask_div_ss_round: 2667 case Intrinsic::x86_avx512_mask_mul_ss_round: 2668 case Intrinsic::x86_avx512_mask_sub_ss_round: 2669 case Intrinsic::x86_avx512_mask_add_sd_round: 2670 case Intrinsic::x86_avx512_mask_div_sd_round: 2671 case Intrinsic::x86_avx512_mask_mul_sd_round: 2672 case Intrinsic::x86_avx512_mask_sub_sd_round: 2673 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2674 // IR operations. 2675 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2676 if (R->getValue() == 4) { 2677 // Extract the element as scalars. 2678 Value *Arg0 = II->getArgOperand(0); 2679 Value *Arg1 = II->getArgOperand(1); 2680 Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); 2681 Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); 2682 2683 Value *V; 2684 switch (IID) { 2685 default: llvm_unreachable("Case stmts out of sync!"); 2686 case Intrinsic::x86_avx512_mask_add_ss_round: 2687 case Intrinsic::x86_avx512_mask_add_sd_round: 2688 V = Builder.CreateFAdd(LHS, RHS); 2689 break; 2690 case Intrinsic::x86_avx512_mask_sub_ss_round: 2691 case Intrinsic::x86_avx512_mask_sub_sd_round: 2692 V = Builder.CreateFSub(LHS, RHS); 2693 break; 2694 case Intrinsic::x86_avx512_mask_mul_ss_round: 2695 case Intrinsic::x86_avx512_mask_mul_sd_round: 2696 V = Builder.CreateFMul(LHS, RHS); 2697 break; 2698 case Intrinsic::x86_avx512_mask_div_ss_round: 2699 case Intrinsic::x86_avx512_mask_div_sd_round: 2700 V = Builder.CreateFDiv(LHS, RHS); 2701 break; 2702 } 2703 2704 // Handle the masking aspect of the intrinsic. 2705 Value *Mask = II->getArgOperand(3); 2706 auto *C = dyn_cast<ConstantInt>(Mask); 2707 // We don't need a select if we know the mask bit is a 1. 2708 if (!C || !C->getValue()[0]) { 2709 // Cast the mask to an i1 vector and then extract the lowest element. 2710 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 2711 cast<IntegerType>(Mask->getType())->getBitWidth()); 2712 Mask = Builder.CreateBitCast(Mask, MaskTy); 2713 Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); 2714 // Extract the lowest element from the passthru operand. 2715 Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), 2716 (uint64_t)0); 2717 V = Builder.CreateSelect(Mask, V, Passthru); 2718 } 2719 2720 // Insert the result back into the original argument 0. 2721 V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2722 2723 return replaceInstUsesWith(*II, V); 2724 } 2725 } 2726 break; 2727 2728 // Constant fold ashr( <A x Bi>, Ci ). 2729 // Constant fold lshr( <A x Bi>, Ci ). 
2730 // Constant fold shl( <A x Bi>, Ci ). 2731 case Intrinsic::x86_sse2_psrai_d: 2732 case Intrinsic::x86_sse2_psrai_w: 2733 case Intrinsic::x86_avx2_psrai_d: 2734 case Intrinsic::x86_avx2_psrai_w: 2735 case Intrinsic::x86_avx512_psrai_q_128: 2736 case Intrinsic::x86_avx512_psrai_q_256: 2737 case Intrinsic::x86_avx512_psrai_d_512: 2738 case Intrinsic::x86_avx512_psrai_q_512: 2739 case Intrinsic::x86_avx512_psrai_w_512: 2740 case Intrinsic::x86_sse2_psrli_d: 2741 case Intrinsic::x86_sse2_psrli_q: 2742 case Intrinsic::x86_sse2_psrli_w: 2743 case Intrinsic::x86_avx2_psrli_d: 2744 case Intrinsic::x86_avx2_psrli_q: 2745 case Intrinsic::x86_avx2_psrli_w: 2746 case Intrinsic::x86_avx512_psrli_d_512: 2747 case Intrinsic::x86_avx512_psrli_q_512: 2748 case Intrinsic::x86_avx512_psrli_w_512: 2749 case Intrinsic::x86_sse2_pslli_d: 2750 case Intrinsic::x86_sse2_pslli_q: 2751 case Intrinsic::x86_sse2_pslli_w: 2752 case Intrinsic::x86_avx2_pslli_d: 2753 case Intrinsic::x86_avx2_pslli_q: 2754 case Intrinsic::x86_avx2_pslli_w: 2755 case Intrinsic::x86_avx512_pslli_d_512: 2756 case Intrinsic::x86_avx512_pslli_q_512: 2757 case Intrinsic::x86_avx512_pslli_w_512: 2758 if (Value *V = simplifyX86immShift(*II, Builder)) 2759 return replaceInstUsesWith(*II, V); 2760 break; 2761 2762 case Intrinsic::x86_sse2_psra_d: 2763 case Intrinsic::x86_sse2_psra_w: 2764 case Intrinsic::x86_avx2_psra_d: 2765 case Intrinsic::x86_avx2_psra_w: 2766 case Intrinsic::x86_avx512_psra_q_128: 2767 case Intrinsic::x86_avx512_psra_q_256: 2768 case Intrinsic::x86_avx512_psra_d_512: 2769 case Intrinsic::x86_avx512_psra_q_512: 2770 case Intrinsic::x86_avx512_psra_w_512: 2771 case Intrinsic::x86_sse2_psrl_d: 2772 case Intrinsic::x86_sse2_psrl_q: 2773 case Intrinsic::x86_sse2_psrl_w: 2774 case Intrinsic::x86_avx2_psrl_d: 2775 case Intrinsic::x86_avx2_psrl_q: 2776 case Intrinsic::x86_avx2_psrl_w: 2777 case Intrinsic::x86_avx512_psrl_d_512: 2778 case Intrinsic::x86_avx512_psrl_q_512: 2779 case Intrinsic::x86_avx512_psrl_w_512: 2780 case Intrinsic::x86_sse2_psll_d: 2781 case Intrinsic::x86_sse2_psll_q: 2782 case Intrinsic::x86_sse2_psll_w: 2783 case Intrinsic::x86_avx2_psll_d: 2784 case Intrinsic::x86_avx2_psll_q: 2785 case Intrinsic::x86_avx2_psll_w: 2786 case Intrinsic::x86_avx512_psll_d_512: 2787 case Intrinsic::x86_avx512_psll_q_512: 2788 case Intrinsic::x86_avx512_psll_w_512: { 2789 if (Value *V = simplifyX86immShift(*II, Builder)) 2790 return replaceInstUsesWith(*II, V); 2791 2792 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2793 // operand to compute the shift amount. 
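    // (Illustrative: for psrl.d the <4 x i32> shift-amount operand is read as
    // a single 64-bit count, so its upper two lanes can be made undef.)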
2794 Value *Arg1 = II->getArgOperand(1); 2795 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2796 "Unexpected packed shift size"); 2797 unsigned VWidth = Arg1->getType()->getVectorNumElements(); 2798 2799 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2800 II->setArgOperand(1, V); 2801 return II; 2802 } 2803 break; 2804 } 2805 2806 case Intrinsic::x86_avx2_psllv_d: 2807 case Intrinsic::x86_avx2_psllv_d_256: 2808 case Intrinsic::x86_avx2_psllv_q: 2809 case Intrinsic::x86_avx2_psllv_q_256: 2810 case Intrinsic::x86_avx512_psllv_d_512: 2811 case Intrinsic::x86_avx512_psllv_q_512: 2812 case Intrinsic::x86_avx512_psllv_w_128: 2813 case Intrinsic::x86_avx512_psllv_w_256: 2814 case Intrinsic::x86_avx512_psllv_w_512: 2815 case Intrinsic::x86_avx2_psrav_d: 2816 case Intrinsic::x86_avx2_psrav_d_256: 2817 case Intrinsic::x86_avx512_psrav_q_128: 2818 case Intrinsic::x86_avx512_psrav_q_256: 2819 case Intrinsic::x86_avx512_psrav_d_512: 2820 case Intrinsic::x86_avx512_psrav_q_512: 2821 case Intrinsic::x86_avx512_psrav_w_128: 2822 case Intrinsic::x86_avx512_psrav_w_256: 2823 case Intrinsic::x86_avx512_psrav_w_512: 2824 case Intrinsic::x86_avx2_psrlv_d: 2825 case Intrinsic::x86_avx2_psrlv_d_256: 2826 case Intrinsic::x86_avx2_psrlv_q: 2827 case Intrinsic::x86_avx2_psrlv_q_256: 2828 case Intrinsic::x86_avx512_psrlv_d_512: 2829 case Intrinsic::x86_avx512_psrlv_q_512: 2830 case Intrinsic::x86_avx512_psrlv_w_128: 2831 case Intrinsic::x86_avx512_psrlv_w_256: 2832 case Intrinsic::x86_avx512_psrlv_w_512: 2833 if (Value *V = simplifyX86varShift(*II, Builder)) 2834 return replaceInstUsesWith(*II, V); 2835 break; 2836 2837 case Intrinsic::x86_sse2_packssdw_128: 2838 case Intrinsic::x86_sse2_packsswb_128: 2839 case Intrinsic::x86_avx2_packssdw: 2840 case Intrinsic::x86_avx2_packsswb: 2841 case Intrinsic::x86_avx512_packssdw_512: 2842 case Intrinsic::x86_avx512_packsswb_512: 2843 if (Value *V = simplifyX86pack(*II, Builder, true)) 2844 return replaceInstUsesWith(*II, V); 2845 break; 2846 2847 case Intrinsic::x86_sse2_packuswb_128: 2848 case Intrinsic::x86_sse41_packusdw: 2849 case Intrinsic::x86_avx2_packusdw: 2850 case Intrinsic::x86_avx2_packuswb: 2851 case Intrinsic::x86_avx512_packusdw_512: 2852 case Intrinsic::x86_avx512_packuswb_512: 2853 if (Value *V = simplifyX86pack(*II, Builder, false)) 2854 return replaceInstUsesWith(*II, V); 2855 break; 2856 2857 case Intrinsic::x86_pclmulqdq: 2858 case Intrinsic::x86_pclmulqdq_256: 2859 case Intrinsic::x86_pclmulqdq_512: { 2860 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2861 unsigned Imm = C->getZExtValue(); 2862 2863 bool MadeChange = false; 2864 Value *Arg0 = II->getArgOperand(0); 2865 Value *Arg1 = II->getArgOperand(1); 2866 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2867 2868 APInt UndefElts1(VWidth, 0); 2869 APInt DemandedElts1 = APInt::getSplat(VWidth, 2870 APInt(2, (Imm & 0x01) ? 2 : 1)); 2871 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, 2872 UndefElts1)) { 2873 II->setArgOperand(0, V); 2874 MadeChange = true; 2875 } 2876 2877 APInt UndefElts2(VWidth, 0); 2878 APInt DemandedElts2 = APInt::getSplat(VWidth, 2879 APInt(2, (Imm & 0x10) ? 2 : 1)); 2880 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, 2881 UndefElts2)) { 2882 II->setArgOperand(1, V); 2883 MadeChange = true; 2884 } 2885 2886 // If either input elements are undef, the result is zero. 
2887 if (DemandedElts1.isSubsetOf(UndefElts1) || 2888 DemandedElts2.isSubsetOf(UndefElts2)) 2889 return replaceInstUsesWith(*II, 2890 ConstantAggregateZero::get(II->getType())); 2891 2892 if (MadeChange) 2893 return II; 2894 } 2895 break; 2896 } 2897 2898 case Intrinsic::x86_sse41_insertps: 2899 if (Value *V = simplifyX86insertps(*II, Builder)) 2900 return replaceInstUsesWith(*II, V); 2901 break; 2902 2903 case Intrinsic::x86_sse4a_extrq: { 2904 Value *Op0 = II->getArgOperand(0); 2905 Value *Op1 = II->getArgOperand(1); 2906 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 2907 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 2908 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2909 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2910 VWidth1 == 16 && "Unexpected operand sizes"); 2911 2912 // See if we're dealing with constant values. 2913 Constant *C1 = dyn_cast<Constant>(Op1); 2914 ConstantInt *CILength = 2915 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2916 : nullptr; 2917 ConstantInt *CIIndex = 2918 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2919 : nullptr; 2920 2921 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2922 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 2923 return replaceInstUsesWith(*II, V); 2924 2925 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2926 // operands and the lowest 16-bits of the second. 2927 bool MadeChange = false; 2928 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2929 II->setArgOperand(0, V); 2930 MadeChange = true; 2931 } 2932 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2933 II->setArgOperand(1, V); 2934 MadeChange = true; 2935 } 2936 if (MadeChange) 2937 return II; 2938 break; 2939 } 2940 2941 case Intrinsic::x86_sse4a_extrqi: { 2942 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2943 // bits of the lower 64-bits. The upper 64-bits are undefined. 2944 Value *Op0 = II->getArgOperand(0); 2945 unsigned VWidth = Op0->getType()->getVectorNumElements(); 2946 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2947 "Unexpected operand size"); 2948 2949 // See if we're dealing with constant values. 2950 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 2951 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 2952 2953 // Attempt to simplify to a constant or shuffle vector. 2954 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 2955 return replaceInstUsesWith(*II, V); 2956 2957 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2958 // operand. 2959 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2960 II->setArgOperand(0, V); 2961 return II; 2962 } 2963 break; 2964 } 2965 2966 case Intrinsic::x86_sse4a_insertq: { 2967 Value *Op0 = II->getArgOperand(0); 2968 Value *Op1 = II->getArgOperand(1); 2969 unsigned VWidth = Op0->getType()->getVectorNumElements(); 2970 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2971 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2972 Op1->getType()->getVectorNumElements() == 2 && 2973 "Unexpected operand size"); 2974 2975 // See if we're dealing with constant values. 2976 Constant *C1 = dyn_cast<Constant>(Op1); 2977 ConstantInt *CI11 = 2978 C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2979 : nullptr; 2980 2981 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2982 if (CI11) { 2983 const APInt &V11 = CI11->getValue(); 2984 APInt Len = V11.zextOrTrunc(6); 2985 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2986 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 2987 return replaceInstUsesWith(*II, V); 2988 } 2989 2990 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2991 // operand. 2992 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2993 II->setArgOperand(0, V); 2994 return II; 2995 } 2996 break; 2997 } 2998 2999 case Intrinsic::x86_sse4a_insertqi: { 3000 // INSERTQI: Extract lowest Length bits from lower half of second source and 3001 // insert over first source starting at Index bit. The upper 64-bits are 3002 // undefined. 3003 Value *Op0 = II->getArgOperand(0); 3004 Value *Op1 = II->getArgOperand(1); 3005 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 3006 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 3007 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3008 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3009 VWidth1 == 2 && "Unexpected operand sizes"); 3010 3011 // See if we're dealing with constant values. 3012 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3013 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 3014 3015 // Attempt to simplify to a constant or shuffle vector. 3016 if (CILength && CIIndex) { 3017 APInt Len = CILength->getValue().zextOrTrunc(6); 3018 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 3019 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3020 return replaceInstUsesWith(*II, V); 3021 } 3022 3023 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 3024 // operands. 3025 bool MadeChange = false; 3026 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3027 II->setArgOperand(0, V); 3028 MadeChange = true; 3029 } 3030 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 3031 II->setArgOperand(1, V); 3032 MadeChange = true; 3033 } 3034 if (MadeChange) 3035 return II; 3036 break; 3037 } 3038 3039 case Intrinsic::x86_sse41_pblendvb: 3040 case Intrinsic::x86_sse41_blendvps: 3041 case Intrinsic::x86_sse41_blendvpd: 3042 case Intrinsic::x86_avx_blendv_ps_256: 3043 case Intrinsic::x86_avx_blendv_pd_256: 3044 case Intrinsic::x86_avx2_pblendvb: { 3045 // fold (blend A, A, Mask) -> A 3046 Value *Op0 = II->getArgOperand(0); 3047 Value *Op1 = II->getArgOperand(1); 3048 Value *Mask = II->getArgOperand(2); 3049 if (Op0 == Op1) 3050 return replaceInstUsesWith(CI, Op0); 3051 3052 // Zero Mask - select 1st argument. 3053 if (isa<ConstantAggregateZero>(Mask)) 3054 return replaceInstUsesWith(CI, Op0); 3055 3056 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 3057 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 3058 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 3059 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 3060 } 3061 3062 // Convert to a vector select if we can bypass casts and find a boolean 3063 // vector condition value. 
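    // For example (illustrative): blendvps whose mask is (a bitcast of)
    //   %m = sext <4 x i1> %b to <4 x i32>
    // becomes
    //   select <4 x i1> %b, <4 x float> %op1, <4 x float> %op0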
3064 Value *BoolVec; 3065 Mask = peekThroughBitcast(Mask); 3066 if (match(Mask, m_SExt(m_Value(BoolVec))) && 3067 BoolVec->getType()->isVectorTy() && 3068 BoolVec->getType()->getScalarSizeInBits() == 1) { 3069 assert(Mask->getType()->getPrimitiveSizeInBits() == 3070 II->getType()->getPrimitiveSizeInBits() && 3071 "Not expecting mask and operands with different sizes"); 3072 3073 unsigned NumMaskElts = Mask->getType()->getVectorNumElements(); 3074 unsigned NumOperandElts = II->getType()->getVectorNumElements(); 3075 if (NumMaskElts == NumOperandElts) 3076 return SelectInst::Create(BoolVec, Op1, Op0); 3077 3078 // If the mask has less elements than the operands, each mask bit maps to 3079 // multiple elements of the operands. Bitcast back and forth. 3080 if (NumMaskElts < NumOperandElts) { 3081 Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); 3082 Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); 3083 Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 3084 return new BitCastInst(Sel, II->getType()); 3085 } 3086 } 3087 3088 break; 3089 } 3090 3091 case Intrinsic::x86_ssse3_pshuf_b_128: 3092 case Intrinsic::x86_avx2_pshuf_b: 3093 case Intrinsic::x86_avx512_pshuf_b_512: 3094 if (Value *V = simplifyX86pshufb(*II, Builder)) 3095 return replaceInstUsesWith(*II, V); 3096 break; 3097 3098 case Intrinsic::x86_avx_vpermilvar_ps: 3099 case Intrinsic::x86_avx_vpermilvar_ps_256: 3100 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3101 case Intrinsic::x86_avx_vpermilvar_pd: 3102 case Intrinsic::x86_avx_vpermilvar_pd_256: 3103 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3104 if (Value *V = simplifyX86vpermilvar(*II, Builder)) 3105 return replaceInstUsesWith(*II, V); 3106 break; 3107 3108 case Intrinsic::x86_avx2_permd: 3109 case Intrinsic::x86_avx2_permps: 3110 case Intrinsic::x86_avx512_permvar_df_256: 3111 case Intrinsic::x86_avx512_permvar_df_512: 3112 case Intrinsic::x86_avx512_permvar_di_256: 3113 case Intrinsic::x86_avx512_permvar_di_512: 3114 case Intrinsic::x86_avx512_permvar_hi_128: 3115 case Intrinsic::x86_avx512_permvar_hi_256: 3116 case Intrinsic::x86_avx512_permvar_hi_512: 3117 case Intrinsic::x86_avx512_permvar_qi_128: 3118 case Intrinsic::x86_avx512_permvar_qi_256: 3119 case Intrinsic::x86_avx512_permvar_qi_512: 3120 case Intrinsic::x86_avx512_permvar_sf_512: 3121 case Intrinsic::x86_avx512_permvar_si_512: 3122 if (Value *V = simplifyX86vpermv(*II, Builder)) 3123 return replaceInstUsesWith(*II, V); 3124 break; 3125 3126 case Intrinsic::x86_avx_maskload_ps: 3127 case Intrinsic::x86_avx_maskload_pd: 3128 case Intrinsic::x86_avx_maskload_ps_256: 3129 case Intrinsic::x86_avx_maskload_pd_256: 3130 case Intrinsic::x86_avx2_maskload_d: 3131 case Intrinsic::x86_avx2_maskload_q: 3132 case Intrinsic::x86_avx2_maskload_d_256: 3133 case Intrinsic::x86_avx2_maskload_q_256: 3134 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 3135 return I; 3136 break; 3137 3138 case Intrinsic::x86_sse2_maskmov_dqu: 3139 case Intrinsic::x86_avx_maskstore_ps: 3140 case Intrinsic::x86_avx_maskstore_pd: 3141 case Intrinsic::x86_avx_maskstore_ps_256: 3142 case Intrinsic::x86_avx_maskstore_pd_256: 3143 case Intrinsic::x86_avx2_maskstore_d: 3144 case Intrinsic::x86_avx2_maskstore_q: 3145 case Intrinsic::x86_avx2_maskstore_d_256: 3146 case Intrinsic::x86_avx2_maskstore_q_256: 3147 if (simplifyX86MaskedStore(*II, *this)) 3148 return nullptr; 3149 break; 3150 3151 case Intrinsic::x86_addcarry_32: 3152 case Intrinsic::x86_addcarry_64: 3153 if (Value *V = simplifyX86addcarry(*II, 
Builder)) 3154 return replaceInstUsesWith(*II, V); 3155 break; 3156 3157 case Intrinsic::ppc_altivec_vperm: 3158 // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. 3159 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3160 // a vectorshuffle for little endian, we must undo the transformation 3161 // performed on vec_perm in altivec.h. That is, we must complement 3162 // the permutation mask with respect to 31 and reverse the order of 3163 // V1 and V2. 3164 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3165 assert(Mask->getType()->getVectorNumElements() == 16 && 3166 "Bad type for intrinsic!"); 3167 3168 // Check that all of the elements are integer constants or undefs. 3169 bool AllEltsOk = true; 3170 for (unsigned i = 0; i != 16; ++i) { 3171 Constant *Elt = Mask->getAggregateElement(i); 3172 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3173 AllEltsOk = false; 3174 break; 3175 } 3176 } 3177 3178 if (AllEltsOk) { 3179 // Cast the input vectors to byte vectors. 3180 Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), 3181 Mask->getType()); 3182 Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), 3183 Mask->getType()); 3184 Value *Result = UndefValue::get(Op0->getType()); 3185 3186 // Only extract each element once. 3187 Value *ExtractedElts[32]; 3188 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3189 3190 for (unsigned i = 0; i != 16; ++i) { 3191 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3192 continue; 3193 unsigned Idx = 3194 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3195 Idx &= 31; // Match the hardware behavior. 3196 if (DL.isLittleEndian()) 3197 Idx = 31 - Idx; 3198 3199 if (!ExtractedElts[Idx]) { 3200 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3201 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3202 ExtractedElts[Idx] = 3203 Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3204 Builder.getInt32(Idx&15)); 3205 } 3206 3207 // Insert this value into the result vector. 
3208 Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], 3209 Builder.getInt32(i)); 3210 } 3211 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3212 } 3213 } 3214 break; 3215 3216 case Intrinsic::arm_neon_vld1: { 3217 unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), 3218 DL, II, &AC, &DT); 3219 if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder)) 3220 return replaceInstUsesWith(*II, V); 3221 break; 3222 } 3223 3224 case Intrinsic::arm_neon_vld2: 3225 case Intrinsic::arm_neon_vld3: 3226 case Intrinsic::arm_neon_vld4: 3227 case Intrinsic::arm_neon_vld2lane: 3228 case Intrinsic::arm_neon_vld3lane: 3229 case Intrinsic::arm_neon_vld4lane: 3230 case Intrinsic::arm_neon_vst1: 3231 case Intrinsic::arm_neon_vst2: 3232 case Intrinsic::arm_neon_vst3: 3233 case Intrinsic::arm_neon_vst4: 3234 case Intrinsic::arm_neon_vst2lane: 3235 case Intrinsic::arm_neon_vst3lane: 3236 case Intrinsic::arm_neon_vst4lane: { 3237 unsigned MemAlign = 3238 getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3239 unsigned AlignArg = II->getNumArgOperands() - 1; 3240 ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); 3241 if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { 3242 II->setArgOperand(AlignArg, 3243 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3244 MemAlign, false)); 3245 return II; 3246 } 3247 break; 3248 } 3249 3250 case Intrinsic::arm_neon_vtbl1: 3251 case Intrinsic::aarch64_neon_tbl1: 3252 if (Value *V = simplifyNeonTbl1(*II, Builder)) 3253 return replaceInstUsesWith(*II, V); 3254 break; 3255 3256 case Intrinsic::arm_neon_vmulls: 3257 case Intrinsic::arm_neon_vmullu: 3258 case Intrinsic::aarch64_neon_smull: 3259 case Intrinsic::aarch64_neon_umull: { 3260 Value *Arg0 = II->getArgOperand(0); 3261 Value *Arg1 = II->getArgOperand(1); 3262 3263 // Handle mul by zero first: 3264 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3265 return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); 3266 } 3267 3268 // Check for constant LHS & RHS - in this case we just simplify. 3269 bool Zext = (IID == Intrinsic::arm_neon_vmullu || 3270 IID == Intrinsic::aarch64_neon_umull); 3271 VectorType *NewVT = cast<VectorType>(II->getType()); 3272 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3273 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3274 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3275 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3276 3277 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3278 } 3279 3280 // Couldn't simplify - canonicalize constant to the RHS. 
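// e.g. vmull(C, %x) is handled as vmull(%x, C) after this swap, so the
// splat-of-one check below only has to inspect Arg1.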
3281 std::swap(Arg0, Arg1); 3282 } 3283 3284 // Handle mul by one: 3285 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3286 if (ConstantInt *Splat = 3287 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3288 if (Splat->isOne()) 3289 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3290 /*isSigned=*/!Zext); 3291 3292 break; 3293 } 3294 case Intrinsic::arm_neon_aesd: 3295 case Intrinsic::arm_neon_aese: 3296 case Intrinsic::aarch64_crypto_aesd: 3297 case Intrinsic::aarch64_crypto_aese: { 3298 Value *DataArg = II->getArgOperand(0); 3299 Value *KeyArg = II->getArgOperand(1); 3300 3301 // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR 3302 Value *Data, *Key; 3303 if (match(KeyArg, m_ZeroInt()) && 3304 match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) { 3305 II->setArgOperand(0, Data); 3306 II->setArgOperand(1, Key); 3307 return II; 3308 } 3309 break; 3310 } 3311 case Intrinsic::amdgcn_rcp: { 3312 Value *Src = II->getArgOperand(0); 3313 3314 // TODO: Move to ConstantFolding/InstSimplify? 3315 if (isa<UndefValue>(Src)) 3316 return replaceInstUsesWith(CI, Src); 3317 3318 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3319 const APFloat &ArgVal = C->getValueAPF(); 3320 APFloat Val(ArgVal.getSemantics(), 1.0); 3321 APFloat::opStatus Status = Val.divide(ArgVal, 3322 APFloat::rmNearestTiesToEven); 3323 // Only do this if it was exact and therefore not dependent on the 3324 // rounding mode. 3325 if (Status == APFloat::opOK) 3326 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3327 } 3328 3329 break; 3330 } 3331 case Intrinsic::amdgcn_rsq: { 3332 Value *Src = II->getArgOperand(0); 3333 3334 // TODO: Move to ConstantFolding/InstSimplify? 3335 if (isa<UndefValue>(Src)) 3336 return replaceInstUsesWith(CI, Src); 3337 break; 3338 } 3339 case Intrinsic::amdgcn_frexp_mant: 3340 case Intrinsic::amdgcn_frexp_exp: { 3341 Value *Src = II->getArgOperand(0); 3342 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3343 int Exp; 3344 APFloat Significand = frexp(C->getValueAPF(), Exp, 3345 APFloat::rmNearestTiesToEven); 3346 3347 if (IID == Intrinsic::amdgcn_frexp_mant) { 3348 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3349 Significand)); 3350 } 3351 3352 // Match instruction special case behavior. 
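// e.g. frexp_exp(+inf) constant-folds to 0 here, mirroring the instruction's
// special-case result, rather than to the IEK_Inf sentinel that APFloat's
// frexp() reports.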
3353 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3354 Exp = 0; 3355 3356 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3357 } 3358 3359 if (isa<UndefValue>(Src)) 3360 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3361 3362 break; 3363 } 3364 case Intrinsic::amdgcn_class: { 3365 enum { 3366 S_NAN = 1 << 0, // Signaling NaN 3367 Q_NAN = 1 << 1, // Quiet NaN 3368 N_INFINITY = 1 << 2, // Negative infinity 3369 N_NORMAL = 1 << 3, // Negative normal 3370 N_SUBNORMAL = 1 << 4, // Negative subnormal 3371 N_ZERO = 1 << 5, // Negative zero 3372 P_ZERO = 1 << 6, // Positive zero 3373 P_SUBNORMAL = 1 << 7, // Positive subnormal 3374 P_NORMAL = 1 << 8, // Positive normal 3375 P_INFINITY = 1 << 9 // Positive infinity 3376 }; 3377 3378 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3379 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3380 3381 Value *Src0 = II->getArgOperand(0); 3382 Value *Src1 = II->getArgOperand(1); 3383 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3384 if (!CMask) { 3385 if (isa<UndefValue>(Src0)) 3386 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3387 3388 if (isa<UndefValue>(Src1)) 3389 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3390 break; 3391 } 3392 3393 uint32_t Mask = CMask->getZExtValue(); 3394 3395 // If all tests are made, it doesn't matter what the value is. 3396 if ((Mask & FullMask) == FullMask) 3397 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3398 3399 if ((Mask & FullMask) == 0) 3400 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3401 3402 if (Mask == (S_NAN | Q_NAN)) { 3403 // Equivalent of isnan. Replace with standard fcmp. 3404 Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); 3405 FCmp->takeName(II); 3406 return replaceInstUsesWith(*II, FCmp); 3407 } 3408 3409 if (Mask == (N_ZERO | P_ZERO)) { 3410 // Equivalent of == 0. 
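// (Reasoning sketch: a mask of N_ZERO|P_ZERO accepts exactly -0.0 and +0.0;
// 'fcmp oeq %x, 0.0' matches the same set, since oeq treats the two zeros as
// equal and is false on NaN, which no bit in this mask accepts.)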
3411 Value *FCmp = Builder.CreateFCmpOEQ( 3412 Src0, ConstantFP::get(Src0->getType(), 0.0)); 3413 3414 FCmp->takeName(II); 3415 return replaceInstUsesWith(*II, FCmp); 3416 } 3417 3418 // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other 3419 if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) { 3420 II->setArgOperand(1, ConstantInt::get(Src1->getType(), 3421 Mask & ~(S_NAN | Q_NAN))); 3422 return II; 3423 } 3424 3425 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3426 if (!CVal) { 3427 if (isa<UndefValue>(Src0)) 3428 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3429 3430 // Clamp mask to used bits 3431 if ((Mask & FullMask) != Mask) { 3432 CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), 3433 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3434 ); 3435 3436 NewCall->takeName(II); 3437 return replaceInstUsesWith(*II, NewCall); 3438 } 3439 3440 break; 3441 } 3442 3443 const APFloat &Val = CVal->getValueAPF(); 3444 3445 bool Result = 3446 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3447 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3448 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3449 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3450 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3451 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3452 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3453 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3454 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3455 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3456 3457 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3458 } 3459 case Intrinsic::amdgcn_cvt_pkrtz: { 3460 Value *Src0 = II->getArgOperand(0); 3461 Value *Src1 = II->getArgOperand(1); 3462 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3463 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3464 const fltSemantics &HalfSem 3465 = II->getType()->getScalarType()->getFltSemantics(); 3466 bool LosesInfo; 3467 APFloat Val0 = C0->getValueAPF(); 3468 APFloat Val1 = C1->getValueAPF(); 3469 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3470 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3471 3472 Constant *Folded = ConstantVector::get({ 3473 ConstantFP::get(II->getContext(), Val0), 3474 ConstantFP::get(II->getContext(), Val1) }); 3475 return replaceInstUsesWith(*II, Folded); 3476 } 3477 } 3478 3479 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3480 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3481 3482 break; 3483 } 3484 case Intrinsic::amdgcn_cvt_pknorm_i16: 3485 case Intrinsic::amdgcn_cvt_pknorm_u16: 3486 case Intrinsic::amdgcn_cvt_pk_i16: 3487 case Intrinsic::amdgcn_cvt_pk_u16: { 3488 Value *Src0 = II->getArgOperand(0); 3489 Value *Src1 = II->getArgOperand(1); 3490 3491 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3492 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3493 3494 break; 3495 } 3496 case Intrinsic::amdgcn_ubfe: 3497 case Intrinsic::amdgcn_sbfe: { 3498 // Decompose simple cases into standard shifts. 
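// Illustrative example: for i32, sbfe(%x, 8, 16) extracts bits [23:8] with
// sign extension, and the code below rewrites it as
// ashr(shl(%x, 32 - 8 - 16), 32 - 16), i.e. shl by 8 followed by ashr by 16.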
3499 Value *Src = II->getArgOperand(0);
3500 if (isa<UndefValue>(Src))
3501 return replaceInstUsesWith(*II, Src);
3502
3503 unsigned Width;
3504 Type *Ty = II->getType();
3505 unsigned IntSize = Ty->getIntegerBitWidth();
3506
3507 ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
3508 if (CWidth) {
3509 Width = CWidth->getZExtValue();
3510 if ((Width & (IntSize - 1)) == 0)
3511 return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
3512
3513 if (Width >= IntSize) {
3514 // Hardware ignores high bits, so remove those.
3515 II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
3516 Width & (IntSize - 1)));
3517 return II;
3518 }
3519 }
3520
3521 unsigned Offset;
3522 ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
3523 if (COffset) {
3524 Offset = COffset->getZExtValue();
3525 if (Offset >= IntSize) {
3526 II->setArgOperand(1, ConstantInt::get(COffset->getType(),
3527 Offset & (IntSize - 1)));
3528 return II;
3529 }
3530 }
3531
3532 bool Signed = IID == Intrinsic::amdgcn_sbfe;
3533
3534 if (!CWidth || !COffset)
3535 break;
3536
3537 // The case of Width == 0 is handled above, which makes this transformation
3538 // safe. If Width == 0, then the ashr and lshr instructions become poison
3539 // values since the shift amount would be equal to the bit size.
3540 assert(Width != 0);
3541
3542 // TODO: This allows folding to undef when the hardware has specific
3543 // behavior?
3544 if (Offset + Width < IntSize) {
3545 Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
3546 Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
3547 : Builder.CreateLShr(Shl, IntSize - Width);
3548 RightShift->takeName(II);
3549 return replaceInstUsesWith(*II, RightShift);
3550 }
3551
3552 Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
3553 : Builder.CreateLShr(Src, Offset);
3554
3555 RightShift->takeName(II);
3556 return replaceInstUsesWith(*II, RightShift);
3557 }
3558 case Intrinsic::amdgcn_exp:
3559 case Intrinsic::amdgcn_exp_compr: {
3560 ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
3561 unsigned EnBits = En->getZExtValue();
3562 if (EnBits == 0xf)
3563 break; // All inputs enabled.
3564
3565 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
3566 bool Changed = false;
3567 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
3568 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
3569 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
3570 Value *Src = II->getArgOperand(I + 2);
3571 if (!isa<UndefValue>(Src)) {
3572 II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
3573 Changed = true;
3574 }
3575 }
3576 }
3577
3578 if (Changed)
3579 return II;
3580
3581 break;
3582 }
3583 case Intrinsic::amdgcn_fmed3: {
3584 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
3585 // for the shader.
3586
3587 Value *Src0 = II->getArgOperand(0);
3588 Value *Src1 = II->getArgOperand(1);
3589 Value *Src2 = II->getArgOperand(2);
3590
3591 // Checking for NaN before canonicalization provides better fidelity when
3592 // mapping other operations onto fmed3 since the order of operands is
3593 // unchanged.
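// For instance, a NaN or undef third operand reduces the call to
// maxnum(Src0, Src1), while a NaN/undef first or second operand reduces it to
// minnum of the two remaining operands; the original call's fast-math flags
// are copied onto the replacement.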
3594 CallInst *NewCall = nullptr; 3595 if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) { 3596 NewCall = Builder.CreateMinNum(Src1, Src2); 3597 } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) { 3598 NewCall = Builder.CreateMinNum(Src0, Src2); 3599 } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3600 NewCall = Builder.CreateMaxNum(Src0, Src1); 3601 } 3602 3603 if (NewCall) { 3604 NewCall->copyFastMathFlags(II); 3605 NewCall->takeName(II); 3606 return replaceInstUsesWith(*II, NewCall); 3607 } 3608 3609 bool Swap = false; 3610 // Canonicalize constants to RHS operands. 3611 // 3612 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3613 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3614 std::swap(Src0, Src1); 3615 Swap = true; 3616 } 3617 3618 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3619 std::swap(Src1, Src2); 3620 Swap = true; 3621 } 3622 3623 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3624 std::swap(Src0, Src1); 3625 Swap = true; 3626 } 3627 3628 if (Swap) { 3629 II->setArgOperand(0, Src0); 3630 II->setArgOperand(1, Src1); 3631 II->setArgOperand(2, Src2); 3632 return II; 3633 } 3634 3635 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3636 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3637 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3638 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), 3639 C2->getValueAPF()); 3640 return replaceInstUsesWith(*II, 3641 ConstantFP::get(Builder.getContext(), Result)); 3642 } 3643 } 3644 } 3645 3646 break; 3647 } 3648 case Intrinsic::amdgcn_icmp: 3649 case Intrinsic::amdgcn_fcmp: { 3650 const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2)); 3651 // Guard against invalid arguments. 3652 int64_t CCVal = CC->getZExtValue(); 3653 bool IsInteger = IID == Intrinsic::amdgcn_icmp; 3654 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3655 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3656 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3657 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3658 break; 3659 3660 Value *Src0 = II->getArgOperand(0); 3661 Value *Src1 = II->getArgOperand(1); 3662 3663 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3664 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3665 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3666 if (CCmp->isNullValue()) { 3667 return replaceInstUsesWith( 3668 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3669 } 3670 3671 // The result of V_ICMP/V_FCMP assembly instructions (which this 3672 // intrinsic exposes) is one bit per thread, masked with the EXEC 3673 // register (which contains the bitmask of live threads). So a 3674 // comparison that always returns true is the same as a read of the 3675 // EXEC register. 3676 Function *NewF = Intrinsic::getDeclaration( 3677 II->getModule(), Intrinsic::read_register, II->getType()); 3678 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3679 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3680 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3681 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3682 NewCall->addAttribute(AttributeList::FunctionIndex, 3683 Attribute::Convergent); 3684 NewCall->takeName(II); 3685 return replaceInstUsesWith(*II, NewCall); 3686 } 3687 3688 // Canonicalize constants to RHS. 
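// Illustrative shape: llvm.amdgcn.icmp(7, %x, slt) becomes
// llvm.amdgcn.icmp(%x, 7, sgt) -- the operands are swapped and the predicate
// is replaced by its swapped form, so later folds only see constants on the
// RHS.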
3689 CmpInst::Predicate SwapPred 3690 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3691 II->setArgOperand(0, Src1); 3692 II->setArgOperand(1, Src0); 3693 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3694 static_cast<int>(SwapPred))); 3695 return II; 3696 } 3697 3698 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3699 break; 3700 3701 // Canonicalize compare eq with true value to compare != 0 3702 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3703 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3704 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3705 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3706 Value *ExtSrc; 3707 if (CCVal == CmpInst::ICMP_EQ && 3708 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3709 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3710 ExtSrc->getType()->isIntegerTy(1)) { 3711 II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType())); 3712 II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3713 return II; 3714 } 3715 3716 CmpInst::Predicate SrcPred; 3717 Value *SrcLHS; 3718 Value *SrcRHS; 3719 3720 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3721 // intrinsic. The typical use is a wave vote function in the library, which 3722 // will be fed from a user code condition compared with 0. Fold in the 3723 // redundant compare. 3724 3725 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3726 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3727 // 3728 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3729 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3730 if (match(Src1, m_Zero()) && 3731 match(Src0, 3732 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3733 if (CCVal == CmpInst::ICMP_EQ) 3734 SrcPred = CmpInst::getInversePredicate(SrcPred); 3735 3736 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 3737 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3738 3739 Type *Ty = SrcLHS->getType(); 3740 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { 3741 // Promote to next legal integer type. 3742 unsigned Width = CmpType->getBitWidth(); 3743 unsigned NewWidth = Width; 3744 3745 // Don't do anything for i1 comparisons. 3746 if (Width == 1) 3747 break; 3748 3749 if (Width <= 16) 3750 NewWidth = 16; 3751 else if (Width <= 32) 3752 NewWidth = 32; 3753 else if (Width <= 64) 3754 NewWidth = 64; 3755 else if (Width > 64) 3756 break; // Can't handle this. 3757 3758 if (Width != NewWidth) { 3759 IntegerType *CmpTy = Builder.getIntNTy(NewWidth); 3760 if (CmpInst::isSigned(SrcPred)) { 3761 SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); 3762 SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); 3763 } else { 3764 SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); 3765 SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); 3766 } 3767 } 3768 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) 3769 break; 3770 3771 Function *NewF = 3772 Intrinsic::getDeclaration(II->getModule(), NewIID, 3773 { II->getType(), 3774 SrcLHS->getType() }); 3775 Value *Args[] = { SrcLHS, SrcRHS, 3776 ConstantInt::get(CC->getType(), SrcPred) }; 3777 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3778 NewCall->takeName(II); 3779 return replaceInstUsesWith(*II, NewCall); 3780 } 3781 3782 break; 3783 } 3784 case Intrinsic::amdgcn_wqm_vote: { 3785 // wqm_vote is identity when the argument is constant. 
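// e.g. llvm.amdgcn.wqm.vote(i1 true) folds to true; a constant condition is
// the same for every lane of the quad, so the vote cannot change it.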
3786 if (!isa<Constant>(II->getArgOperand(0))) 3787 break; 3788 3789 return replaceInstUsesWith(*II, II->getArgOperand(0)); 3790 } 3791 case Intrinsic::amdgcn_kill: { 3792 const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0)); 3793 if (!C || !C->getZExtValue()) 3794 break; 3795 3796 // amdgcn.kill(i1 1) is a no-op 3797 return eraseInstFromFunction(CI); 3798 } 3799 case Intrinsic::amdgcn_update_dpp: { 3800 Value *Old = II->getArgOperand(0); 3801 3802 auto BC = cast<ConstantInt>(II->getArgOperand(5)); 3803 auto RM = cast<ConstantInt>(II->getArgOperand(3)); 3804 auto BM = cast<ConstantInt>(II->getArgOperand(4)); 3805 if (BC->isZeroValue() || 3806 RM->getZExtValue() != 0xF || 3807 BM->getZExtValue() != 0xF || 3808 isa<UndefValue>(Old)) 3809 break; 3810 3811 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. 3812 II->setOperand(0, UndefValue::get(Old->getType())); 3813 return II; 3814 } 3815 case Intrinsic::amdgcn_readfirstlane: 3816 case Intrinsic::amdgcn_readlane: { 3817 // A constant value is trivially uniform. 3818 if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0))) 3819 return replaceInstUsesWith(*II, C); 3820 3821 // The rest of these may not be safe if the exec may not be the same between 3822 // the def and use. 3823 Value *Src = II->getArgOperand(0); 3824 Instruction *SrcInst = dyn_cast<Instruction>(Src); 3825 if (SrcInst && SrcInst->getParent() != II->getParent()) 3826 break; 3827 3828 // readfirstlane (readfirstlane x) -> readfirstlane x 3829 // readlane (readfirstlane x), y -> readfirstlane x 3830 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) 3831 return replaceInstUsesWith(*II, Src); 3832 3833 if (IID == Intrinsic::amdgcn_readfirstlane) { 3834 // readfirstlane (readlane x, y) -> readlane x, y 3835 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>())) 3836 return replaceInstUsesWith(*II, Src); 3837 } else { 3838 // readlane (readlane x, y), y -> readlane x, y 3839 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>( 3840 m_Value(), m_Specific(II->getArgOperand(1))))) 3841 return replaceInstUsesWith(*II, Src); 3842 } 3843 3844 break; 3845 } 3846 case Intrinsic::stackrestore: { 3847 // If the save is right next to the restore, remove the restore. This can 3848 // happen when variable allocas are DCE'd. 3849 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 3850 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 3851 // Skip over debug info. 3852 if (SS->getNextNonDebugInstruction() == II) { 3853 return eraseInstFromFunction(CI); 3854 } 3855 } 3856 } 3857 3858 // Scan down this block to see if there is another stack restore in the 3859 // same block without an intervening call/alloca. 3860 BasicBlock::iterator BI(II); 3861 Instruction *TI = II->getParent()->getTerminator(); 3862 bool CannotRemove = false; 3863 for (++BI; &*BI != TI; ++BI) { 3864 if (isa<AllocaInst>(BI)) { 3865 CannotRemove = true; 3866 break; 3867 } 3868 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 3869 if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) { 3870 // If there is a stackrestore below this one, remove this one. 3871 if (II2->getIntrinsicID() == Intrinsic::stackrestore) 3872 return eraseInstFromFunction(CI); 3873 3874 // Bail if we cross over an intrinsic with side effects, such as 3875 // llvm.stacksave, llvm.read_register, or llvm.setjmp. 3876 if (II2->mayHaveSideEffects()) { 3877 CannotRemove = true; 3878 break; 3879 } 3880 } else { 3881 // If we found a non-intrinsic call, we can't remove the stack 3882 // restore. 
3883 CannotRemove = true; 3884 break; 3885 } 3886 } 3887 } 3888 3889 // If the stack restore is in a return, resume, or unwind block and if there 3890 // are no allocas or calls between the restore and the return, nuke the 3891 // restore. 3892 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 3893 return eraseInstFromFunction(CI); 3894 break; 3895 } 3896 case Intrinsic::lifetime_start: 3897 // Asan needs to poison memory to detect invalid access which is possible 3898 // even for empty lifetime range. 3899 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || 3900 II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || 3901 II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) 3902 break; 3903 3904 if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start, 3905 Intrinsic::lifetime_end, *this)) 3906 return nullptr; 3907 break; 3908 case Intrinsic::assume: { 3909 Value *IIOperand = II->getArgOperand(0); 3910 // Remove an assume if it is followed by an identical assume. 3911 // TODO: Do we need this? Unless there are conflicting assumptions, the 3912 // computeKnownBits(IIOperand) below here eliminates redundant assumes. 3913 Instruction *Next = II->getNextNonDebugInstruction(); 3914 if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 3915 return eraseInstFromFunction(CI); 3916 3917 // Canonicalize assume(a && b) -> assume(a); assume(b); 3918 // Note: New assumption intrinsics created here are registered by 3919 // the InstCombineIRInserter object. 3920 FunctionType *AssumeIntrinsicTy = II->getFunctionType(); 3921 Value *AssumeIntrinsic = II->getCalledValue(); 3922 Value *A, *B; 3923 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 3924 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); 3925 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); 3926 return eraseInstFromFunction(*II); 3927 } 3928 // assume(!(a || b)) -> assume(!a); assume(!b); 3929 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 3930 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 3931 Builder.CreateNot(A), II->getName()); 3932 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 3933 Builder.CreateNot(B), II->getName()); 3934 return eraseInstFromFunction(*II); 3935 } 3936 3937 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 3938 // (if assume is valid at the load) 3939 CmpInst::Predicate Pred; 3940 Instruction *LHS; 3941 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 3942 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 3943 LHS->getType()->isPointerTy() && 3944 isValidAssumeForContext(II, LHS, &DT)) { 3945 MDNode *MD = MDNode::get(II->getContext(), None); 3946 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 3947 return eraseInstFromFunction(*II); 3948 3949 // TODO: apply nonnull return attributes to calls and invokes 3950 // TODO: apply range metadata for range check patterns? 3951 } 3952 3953 // If there is a dominating assume with the same condition as this one, 3954 // then this one is redundant, and should be removed. 3955 KnownBits Known(1); 3956 computeKnownBits(IIOperand, Known, 0, II); 3957 if (Known.isAllOnes()) 3958 return eraseInstFromFunction(*II); 3959 3960 // Update the cache of affected values for this assumption (we might be 3961 // here because we just simplified the condition). 
3962 AC.updateAffectedValues(II); 3963 break; 3964 } 3965 case Intrinsic::experimental_gc_relocate: { 3966 auto &GCR = *cast<GCRelocateInst>(II); 3967 3968 // If we have two copies of the same pointer in the statepoint argument 3969 // list, canonicalize to one. This may let us common gc.relocates. 3970 if (GCR.getBasePtr() == GCR.getDerivedPtr() && 3971 GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { 3972 auto *OpIntTy = GCR.getOperand(2)->getType(); 3973 II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); 3974 return II; 3975 } 3976 3977 // Translate facts known about a pointer before relocating into 3978 // facts about the relocate value, while being careful to 3979 // preserve relocation semantics. 3980 Value *DerivedPtr = GCR.getDerivedPtr(); 3981 3982 // Remove the relocation if unused, note that this check is required 3983 // to prevent the cases below from looping forever. 3984 if (II->use_empty()) 3985 return eraseInstFromFunction(*II); 3986 3987 // Undef is undef, even after relocation. 3988 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 3989 // most practical collectors, but there was discussion in the review thread 3990 // about whether it was legal for all possible collectors. 3991 if (isa<UndefValue>(DerivedPtr)) 3992 // Use undef of gc_relocate's type to replace it. 3993 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3994 3995 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 3996 // The relocation of null will be null for most any collector. 3997 // TODO: provide a hook for this in GCStrategy. There might be some 3998 // weird collector this property does not hold for. 3999 if (isa<ConstantPointerNull>(DerivedPtr)) 4000 // Use null-pointer of gc_relocate's type to replace it. 4001 return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); 4002 4003 // isKnownNonNull -> nonnull attribute 4004 if (!II->hasRetAttr(Attribute::NonNull) && 4005 isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { 4006 II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); 4007 return II; 4008 } 4009 } 4010 4011 // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) 4012 // Canonicalize on the type from the uses to the defs 4013 4014 // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) 4015 break; 4016 } 4017 4018 case Intrinsic::experimental_guard: { 4019 // Is this guard followed by another guard? We scan forward over a small 4020 // fixed window of instructions to handle common cases with conditions 4021 // computed between guards. 4022 Instruction *NextInst = II->getNextNode(); 4023 for (unsigned i = 0; i < GuardWideningWindow; i++) { 4024 // Note: Using context-free form to avoid compile time blow up 4025 if (!isSafeToSpeculativelyExecute(NextInst)) 4026 break; 4027 NextInst = NextInst->getNextNode(); 4028 } 4029 Value *NextCond = nullptr; 4030 if (match(NextInst, 4031 m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { 4032 Value *CurrCond = II->getArgOperand(0); 4033 4034 // Remove a guard that it is immediately preceded by an identical guard. 4035 if (CurrCond == NextCond) 4036 return eraseInstFromFunction(*NextInst); 4037 4038 // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 
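// Illustrative shape: 'guard(%a); <speculatable defs of %b>; guard(%b)' -- the
// intervening instructions are hoisted above the first guard, the first
// guard's condition becomes '%a & %b', and the second guard is erased.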
4039 Instruction* MoveI = II->getNextNode(); 4040 while (MoveI != NextInst) { 4041 auto *Temp = MoveI; 4042 MoveI = MoveI->getNextNode(); 4043 Temp->moveBefore(II); 4044 } 4045 II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); 4046 return eraseInstFromFunction(*NextInst); 4047 } 4048 break; 4049 } 4050 } 4051 return visitCallBase(*II); 4052 } 4053 4054 // Fence instruction simplification 4055 Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { 4056 // Remove identical consecutive fences. 4057 Instruction *Next = FI.getNextNonDebugInstruction(); 4058 if (auto *NFI = dyn_cast<FenceInst>(Next)) 4059 if (FI.isIdenticalTo(NFI)) 4060 return eraseInstFromFunction(FI); 4061 return nullptr; 4062 } 4063 4064 // InvokeInst simplification 4065 Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { 4066 return visitCallBase(II); 4067 } 4068 4069 // CallBrInst simplification 4070 Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) { 4071 return visitCallBase(CBI); 4072 } 4073 4074 /// If this cast does not affect the value passed through the varargs area, we 4075 /// can eliminate the use of the cast. 4076 static bool isSafeToEliminateVarargsCast(const CallBase &Call, 4077 const DataLayout &DL, 4078 const CastInst *const CI, 4079 const int ix) { 4080 if (!CI->isLosslessCast()) 4081 return false; 4082 4083 // If this is a GC intrinsic, avoid munging types. We need types for 4084 // statepoint reconstruction in SelectionDAG. 4085 // TODO: This is probably something which should be expanded to all 4086 // intrinsics since the entire point of intrinsics is that 4087 // they are understandable by the optimizer. 4088 if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call)) 4089 return false; 4090 4091 // The size of ByVal or InAlloca arguments is derived from the type, so we 4092 // can't change to a type with a different size. If the size were 4093 // passed explicitly we could avoid this check. 4094 if (!Call.isByValOrInAllocaArgument(ix)) 4095 return true; 4096 4097 Type* SrcTy = 4098 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 4099 Type *DstTy = Call.isByValArgument(ix) 4100 ? Call.getParamByValType(ix) 4101 : cast<PointerType>(CI->getType())->getElementType(); 4102 if (!SrcTy->isSized() || !DstTy->isSized()) 4103 return false; 4104 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 4105 return false; 4106 return true; 4107 } 4108 4109 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 4110 if (!CI->getCalledFunction()) return nullptr; 4111 4112 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 4113 replaceInstUsesWith(*From, With); 4114 }; 4115 auto InstCombineErase = [this](Instruction *I) { 4116 eraseInstFromFunction(*I); 4117 }; 4118 LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, 4119 InstCombineErase); 4120 if (Value *With = Simplifier.optimizeCall(CI)) { 4121 ++NumSimplified; 4122 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 4123 } 4124 4125 return nullptr; 4126 } 4127 4128 static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 4129 // Strip off at most one level of pointer casts, looking for an alloca. This 4130 // is good enough in practice and simpler than handling any number of casts. 
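// e.g. TrampMem is typically an alloca, or a single pointer cast of one,
// whose only users are llvm.init.trampoline and llvm.adjust.trampoline calls;
// anything more involved is rejected by the checks below.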
4131 Value *Underlying = TrampMem->stripPointerCasts(); 4132 if (Underlying != TrampMem && 4133 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 4134 return nullptr; 4135 if (!isa<AllocaInst>(Underlying)) 4136 return nullptr; 4137 4138 IntrinsicInst *InitTrampoline = nullptr; 4139 for (User *U : TrampMem->users()) { 4140 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 4141 if (!II) 4142 return nullptr; 4143 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 4144 if (InitTrampoline) 4145 // More than one init_trampoline writes to this value. Give up. 4146 return nullptr; 4147 InitTrampoline = II; 4148 continue; 4149 } 4150 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 4151 // Allow any number of calls to adjust.trampoline. 4152 continue; 4153 return nullptr; 4154 } 4155 4156 // No call to init.trampoline found. 4157 if (!InitTrampoline) 4158 return nullptr; 4159 4160 // Check that the alloca is being used in the expected way. 4161 if (InitTrampoline->getOperand(0) != TrampMem) 4162 return nullptr; 4163 4164 return InitTrampoline; 4165 } 4166 4167 static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 4168 Value *TrampMem) { 4169 // Visit all the previous instructions in the basic block, and try to find a 4170 // init.trampoline which has a direct path to the adjust.trampoline. 4171 for (BasicBlock::iterator I = AdjustTramp->getIterator(), 4172 E = AdjustTramp->getParent()->begin(); 4173 I != E;) { 4174 Instruction *Inst = &*--I; 4175 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 4176 if (II->getIntrinsicID() == Intrinsic::init_trampoline && 4177 II->getOperand(0) == TrampMem) 4178 return II; 4179 if (Inst->mayWriteToMemory()) 4180 return nullptr; 4181 } 4182 return nullptr; 4183 } 4184 4185 // Given a call to llvm.adjust.trampoline, find and return the corresponding 4186 // call to llvm.init.trampoline if the call to the trampoline can be optimized 4187 // to a direct call to a function. Otherwise return NULL. 4188 static IntrinsicInst *findInitTrampoline(Value *Callee) { 4189 Callee = Callee->stripPointerCasts(); 4190 IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); 4191 if (!AdjustTramp || 4192 AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) 4193 return nullptr; 4194 4195 Value *TrampMem = AdjustTramp->getOperand(0); 4196 4197 if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem)) 4198 return IT; 4199 if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem)) 4200 return IT; 4201 return nullptr; 4202 } 4203 4204 static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { 4205 unsigned NumArgs = Call.getNumArgOperands(); 4206 ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); 4207 ConstantInt *Op1C = 4208 (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); 4209 // Bail out if the allocation size is zero. 
4210 if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) 4211 return; 4212 4213 if (isMallocLikeFn(&Call, TLI) && Op0C) { 4214 if (isOpNewLikeFn(&Call, TLI)) 4215 Call.addAttribute(AttributeList::ReturnIndex, 4216 Attribute::getWithDereferenceableBytes( 4217 Call.getContext(), Op0C->getZExtValue())); 4218 else 4219 Call.addAttribute(AttributeList::ReturnIndex, 4220 Attribute::getWithDereferenceableOrNullBytes( 4221 Call.getContext(), Op0C->getZExtValue())); 4222 } else if (isReallocLikeFn(&Call, TLI) && Op1C) { 4223 Call.addAttribute(AttributeList::ReturnIndex, 4224 Attribute::getWithDereferenceableOrNullBytes( 4225 Call.getContext(), Op1C->getZExtValue())); 4226 } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { 4227 bool Overflow; 4228 const APInt &N = Op0C->getValue(); 4229 APInt Size = N.umul_ov(Op1C->getValue(), Overflow); 4230 if (!Overflow) 4231 Call.addAttribute(AttributeList::ReturnIndex, 4232 Attribute::getWithDereferenceableOrNullBytes( 4233 Call.getContext(), Size.getZExtValue())); 4234 } else if (isStrdupLikeFn(&Call, TLI)) { 4235 uint64_t Len = GetStringLength(Call.getOperand(0)); 4236 if (Len) { 4237 // strdup 4238 if (NumArgs == 1) 4239 Call.addAttribute(AttributeList::ReturnIndex, 4240 Attribute::getWithDereferenceableOrNullBytes( 4241 Call.getContext(), Len)); 4242 // strndup 4243 else if (NumArgs == 2 && Op1C) 4244 Call.addAttribute( 4245 AttributeList::ReturnIndex, 4246 Attribute::getWithDereferenceableOrNullBytes( 4247 Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); 4248 } 4249 } 4250 } 4251 4252 /// Improvements for call, callbr and invoke instructions. 4253 Instruction *InstCombiner::visitCallBase(CallBase &Call) { 4254 if (isAllocationFn(&Call, &TLI)) 4255 annotateAnyAllocSite(Call, &TLI); 4256 4257 bool Changed = false; 4258 4259 // Mark any parameters that are known to be non-null with the nonnull 4260 // attribute. This is helpful for inlining calls to functions with null 4261 // checks on their arguments. 4262 SmallVector<unsigned, 4> ArgNos; 4263 unsigned ArgNo = 0; 4264 4265 for (Value *V : Call.args()) { 4266 if (V->getType()->isPointerTy() && 4267 !Call.paramHasAttr(ArgNo, Attribute::NonNull) && 4268 isKnownNonZero(V, DL, 0, &AC, &Call, &DT)) 4269 ArgNos.push_back(ArgNo); 4270 ArgNo++; 4271 } 4272 4273 assert(ArgNo == Call.arg_size() && "sanity check"); 4274 4275 if (!ArgNos.empty()) { 4276 AttributeList AS = Call.getAttributes(); 4277 LLVMContext &Ctx = Call.getContext(); 4278 AS = AS.addParamAttribute(Ctx, ArgNos, 4279 Attribute::get(Ctx, Attribute::NonNull)); 4280 Call.setAttributes(AS); 4281 Changed = true; 4282 } 4283 4284 // If the callee is a pointer to a function, attempt to move any casts to the 4285 // arguments of the call/callbr/invoke. 4286 Value *Callee = Call.getCalledValue(); 4287 if (!isa<Function>(Callee) && transformConstExprCastCall(Call)) 4288 return nullptr; 4289 4290 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 4291 // Remove the convergent attr on calls when the callee is not convergent. 4292 if (Call.isConvergent() && !CalleeF->isConvergent() && 4293 !CalleeF->isIntrinsic()) { 4294 LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call 4295 << "\n"); 4296 Call.setNotConvergent(); 4297 return &Call; 4298 } 4299 4300 // If the call and callee calling conventions don't match, this call must 4301 // be unreachable, as the call is undefined. 4302 if (CalleeF->getCallingConv() != Call.getCallingConv() && 4303 // Only do this for calls to a function with a body. 
A prototype may
4304 // not actually end up matching the implementation's calling conv for a
4305 // variety of reasons (e.g. it may be written in assembly).
4306 !CalleeF->isDeclaration()) {
4307 Instruction *OldCall = &Call;
4308 CreateNonTerminatorUnreachable(OldCall);
4309 // If OldCall does not return void then replaceAllUsesWith undef.
4310 // This allows ValueHandlers and custom metadata to adjust themselves.
4311 if (!OldCall->getType()->isVoidTy())
4312 replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
4313 if (isa<CallInst>(OldCall))
4314 return eraseInstFromFunction(*OldCall);
4315
4316 // We cannot remove an invoke or a callbr, because it would change the
4317 // CFG, just change the callee to a null pointer.
4318 cast<CallBase>(OldCall)->setCalledFunction(
4319 CalleeF->getFunctionType(),
4320 Constant::getNullValue(CalleeF->getType()));
4321 return nullptr;
4322 }
4323 }
4324
4325 if ((isa<ConstantPointerNull>(Callee) &&
4326 !NullPointerIsDefined(Call.getFunction())) ||
4327 isa<UndefValue>(Callee)) {
4328 // If Call does not return void then replaceAllUsesWith undef.
4329 // This allows ValueHandlers and custom metadata to adjust themselves.
4330 if (!Call.getType()->isVoidTy())
4331 replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
4332
4333 if (Call.isTerminator()) {
4334 // Can't remove an invoke or callbr because we cannot change the CFG.
4335 return nullptr;
4336 }
4337
4338 // This instruction is not reachable, just remove it.
4339 CreateNonTerminatorUnreachable(&Call);
4340 return eraseInstFromFunction(Call);
4341 }
4342
4343 if (IntrinsicInst *II = findInitTrampoline(Callee))
4344 return transformCallThroughTrampoline(Call, *II);
4345
4346 PointerType *PTy = cast<PointerType>(Callee->getType());
4347 FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
4348 if (FTy->isVarArg()) {
4349 int ix = FTy->getNumParams();
4350 // See if we can optimize any arguments passed through the varargs area of
4351 // the call.
4352 for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
4353 I != E; ++I, ++ix) {
4354 CastInst *CI = dyn_cast<CastInst>(*I);
4355 if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
4356 *I = CI->getOperand(0);
4357
4358 // Update the byval type to match the argument type.
4359 if (Call.isByValArgument(ix)) {
4360 Call.removeParamAttr(ix, Attribute::ByVal);
4361 Call.addParamAttr(
4362 ix, Attribute::getWithByValType(
4363 Call.getContext(),
4364 CI->getOperand(0)->getType()->getPointerElementType()));
4365 }
4366 Changed = true;
4367 }
4368 }
4369 }
4370
4371 if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
4372 // Inline asm calls cannot throw - mark them 'nounwind'.
4373 Call.setDoesNotThrow();
4374 Changed = true;
4375 }
4376
4377 // Try to optimize the call if possible; we require DataLayout for most of
4378 // this. None of these calls are seen as possibly dead so go ahead and
4379 // delete the instruction now.
4380 if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
4381 Instruction *I = tryOptimizeCall(CI);
4382 // If we changed something, return the result. Otherwise let the
4383 // fallthrough checks below handle it.
4384 if (I) return eraseInstFromFunction(*I);
4385 }
4386
4387 if (isAllocLikeFn(&Call, &TLI))
4388 return visitAllocSite(Call);
4389
4390 return Changed ? &Call : nullptr;
4391 }
4392
4393 /// If the callee is a constexpr cast of a function, attempt to move the cast to
4394 /// the arguments of the call/callbr/invoke.
4395 bool InstCombiner::transformConstExprCastCall(CallBase &Call) { 4396 auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts()); 4397 if (!Callee) 4398 return false; 4399 4400 // If this is a call to a thunk function, don't remove the cast. Thunks are 4401 // used to transparently forward all incoming parameters and outgoing return 4402 // values, so it's important to leave the cast in place. 4403 if (Callee->hasFnAttribute("thunk")) 4404 return false; 4405 4406 // If this is a musttail call, the callee's prototype must match the caller's 4407 // prototype with the exception of pointee types. The code below doesn't 4408 // implement that, so we can't do this transform. 4409 // TODO: Do the transform if it only requires adding pointer casts. 4410 if (Call.isMustTailCall()) 4411 return false; 4412 4413 Instruction *Caller = &Call; 4414 const AttributeList &CallerPAL = Call.getAttributes(); 4415 4416 // Okay, this is a cast from a function to a different type. Unless doing so 4417 // would cause a type conversion of one of our arguments, change this call to 4418 // be a direct call with arguments casted to the appropriate types. 4419 FunctionType *FT = Callee->getFunctionType(); 4420 Type *OldRetTy = Caller->getType(); 4421 Type *NewRetTy = FT->getReturnType(); 4422 4423 // Check to see if we are changing the return type... 4424 if (OldRetTy != NewRetTy) { 4425 4426 if (NewRetTy->isStructTy()) 4427 return false; // TODO: Handle multiple return values. 4428 4429 if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { 4430 if (Callee->isDeclaration()) 4431 return false; // Cannot transform this return value. 4432 4433 if (!Caller->use_empty() && 4434 // void -> non-void is handled specially 4435 !NewRetTy->isVoidTy()) 4436 return false; // Cannot transform this return value. 4437 } 4438 4439 if (!CallerPAL.isEmpty() && !Caller->use_empty()) { 4440 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4441 if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) 4442 return false; // Attribute not compatible with transformed value. 4443 } 4444 4445 // If the callbase is an invoke/callbr instruction, and the return value is 4446 // used by a PHI node in a successor, we cannot change the return type of 4447 // the call because there is no place to put the cast instruction (without 4448 // breaking the critical edge). Bail out in this case. 4449 if (!Caller->use_empty()) { 4450 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) 4451 for (User *U : II->users()) 4452 if (PHINode *PN = dyn_cast<PHINode>(U)) 4453 if (PN->getParent() == II->getNormalDest() || 4454 PN->getParent() == II->getUnwindDest()) 4455 return false; 4456 // FIXME: Be conservative for callbr to avoid a quadratic search. 4457 if (isa<CallBrInst>(Caller)) 4458 return false; 4459 } 4460 } 4461 4462 unsigned NumActualArgs = Call.arg_size(); 4463 unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); 4464 4465 // Prevent us turning: 4466 // declare void @takes_i32_inalloca(i32* inalloca) 4467 // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) 4468 // 4469 // into: 4470 // call void @takes_i32_inalloca(i32* null) 4471 // 4472 // Similarly, avoid folding away bitcasts of byval calls. 
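// (Reasoning sketch: both attributes derive the amount of memory allocated or
// copied for the argument from the pointee type, so rewriting the signature
// here could silently change that size.)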
4473 if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || 4474 Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) 4475 return false; 4476 4477 auto AI = Call.arg_begin(); 4478 for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { 4479 Type *ParamTy = FT->getParamType(i); 4480 Type *ActTy = (*AI)->getType(); 4481 4482 if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) 4483 return false; // Cannot transform this parameter value. 4484 4485 if (AttrBuilder(CallerPAL.getParamAttributes(i)) 4486 .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) 4487 return false; // Attribute not compatible with transformed value. 4488 4489 if (Call.isInAllocaArgument(i)) 4490 return false; // Cannot transform to and from inalloca. 4491 4492 // If the parameter is passed as a byval argument, then we have to have a 4493 // sized type and the sized type has to have the same size as the old type. 4494 if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4495 PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); 4496 if (!ParamPTy || !ParamPTy->getElementType()->isSized()) 4497 return false; 4498 4499 Type *CurElTy = Call.getParamByValType(i); 4500 if (DL.getTypeAllocSize(CurElTy) != 4501 DL.getTypeAllocSize(ParamPTy->getElementType())) 4502 return false; 4503 } 4504 } 4505 4506 if (Callee->isDeclaration()) { 4507 // Do not delete arguments unless we have a function body. 4508 if (FT->getNumParams() < NumActualArgs && !FT->isVarArg()) 4509 return false; 4510 4511 // If the callee is just a declaration, don't change the varargsness of the 4512 // call. We don't want to introduce a varargs call where one doesn't 4513 // already exist. 4514 PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType()); 4515 if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) 4516 return false; 4517 4518 // If both the callee and the cast type are varargs, we still have to make 4519 // sure the number of fixed parameters are the same or we have the same 4520 // ABI issues as if we introduce a varargs call. 4521 if (FT->isVarArg() && 4522 cast<FunctionType>(APTy->getElementType())->isVarArg() && 4523 FT->getNumParams() != 4524 cast<FunctionType>(APTy->getElementType())->getNumParams()) 4525 return false; 4526 } 4527 4528 if (FT->getNumParams() < NumActualArgs && FT->isVarArg() && 4529 !CallerPAL.isEmpty()) { 4530 // In this case we have more arguments than the new function type, but we 4531 // won't be dropping them. Check that these extra arguments have attributes 4532 // that are compatible with being a vararg call argument. 4533 unsigned SRetIdx; 4534 if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) && 4535 SRetIdx > FT->getNumParams()) 4536 return false; 4537 } 4538 4539 // Okay, we decided that this is a safe thing to do: go ahead and start 4540 // inserting cast instructions as necessary. 4541 SmallVector<Value *, 8> Args; 4542 SmallVector<AttributeSet, 8> ArgAttrs; 4543 Args.reserve(NumActualArgs); 4544 ArgAttrs.reserve(NumActualArgs); 4545 4546 // Get any return attributes. 4547 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4548 4549 // If the return value is not being used, the type may not be compatible 4550 // with the existing attributes. Wipe out any problematic attributes. 
4551 RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); 4552 4553 LLVMContext &Ctx = Call.getContext(); 4554 AI = Call.arg_begin(); 4555 for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { 4556 Type *ParamTy = FT->getParamType(i); 4557 4558 Value *NewArg = *AI; 4559 if ((*AI)->getType() != ParamTy) 4560 NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy); 4561 Args.push_back(NewArg); 4562 4563 // Add any parameter attributes. 4564 if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4565 AttrBuilder AB(CallerPAL.getParamAttributes(i)); 4566 AB.addByValAttr(NewArg->getType()->getPointerElementType()); 4567 ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); 4568 } else 4569 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4570 } 4571 4572 // If the function takes more arguments than the call was taking, add them 4573 // now. 4574 for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { 4575 Args.push_back(Constant::getNullValue(FT->getParamType(i))); 4576 ArgAttrs.push_back(AttributeSet()); 4577 } 4578 4579 // If we are removing arguments to the function, emit an obnoxious warning. 4580 if (FT->getNumParams() < NumActualArgs) { 4581 // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 4582 if (FT->isVarArg()) { 4583 // Add all of the arguments in their promoted form to the arg list. 4584 for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { 4585 Type *PTy = getPromotedType((*AI)->getType()); 4586 Value *NewArg = *AI; 4587 if (PTy != (*AI)->getType()) { 4588 // Must promote to pass through va_arg area! 4589 Instruction::CastOps opcode = 4590 CastInst::getCastOpcode(*AI, false, PTy, false); 4591 NewArg = Builder.CreateCast(opcode, *AI, PTy); 4592 } 4593 Args.push_back(NewArg); 4594 4595 // Add any parameter attributes. 4596 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4597 } 4598 } 4599 } 4600 4601 AttributeSet FnAttrs = CallerPAL.getFnAttributes(); 4602 4603 if (NewRetTy->isVoidTy()) 4604 Caller->setName(""); // Void type should not have a name. 4605 4606 assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && 4607 "missing argument attributes"); 4608 AttributeList NewCallerPAL = AttributeList::get( 4609 Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); 4610 4611 SmallVector<OperandBundleDef, 1> OpBundles; 4612 Call.getOperandBundlesAsDefs(OpBundles); 4613 4614 CallBase *NewCall; 4615 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4616 NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(), 4617 II->getUnwindDest(), Args, OpBundles); 4618 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { 4619 NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(), 4620 CBI->getIndirectDests(), Args, OpBundles); 4621 } else { 4622 NewCall = Builder.CreateCall(Callee, Args, OpBundles); 4623 cast<CallInst>(NewCall)->setTailCallKind( 4624 cast<CallInst>(Caller)->getTailCallKind()); 4625 } 4626 NewCall->takeName(Caller); 4627 NewCall->setCallingConv(Call.getCallingConv()); 4628 NewCall->setAttributes(NewCallerPAL); 4629 4630 // Preserve the weight metadata for the new call instruction. The metadata 4631 // is used by SamplePGO to check callsite's hotness. 4632 uint64_t W; 4633 if (Caller->extractProfTotalWeight(W)) 4634 NewCall->setProfWeight(W); 4635 4636 // Insert a cast of the return type as necessary. 
  Instruction *NC = NewCall;
  Value *NV = NC;
  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
    if (!NV->getType()->isVoidTy()) {
      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
      NC->setDebugLoc(Caller->getDebugLoc());

      // If this is an invoke/callbr instruction, we should insert it after the
      // first non-phi instruction in the normal successor block.
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
        BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else {
        // Otherwise, it's a call, just insert cast right after the call.
        InsertNewInstBefore(NC, *Caller);
      }
      Worklist.AddUsersToWorkList(*Caller);
    } else {
      NV = UndefValue::get(Caller->getType());
    }
  }

  if (!Caller->use_empty())
    replaceInstUsesWith(*Caller, NV);
  else if (Caller->hasValueHandle()) {
    if (OldRetTy == NV->getType())
      ValueHandleBase::ValueIsRAUWd(Caller, NV);
    else
      // We cannot call ValueIsRAUWd with a different type, and the
      // actual tracked value will disappear.
      ValueHandleBase::ValueIsDeleted(Caller);
  }

  eraseInstFromFunction(*Caller);
  return true;
}

/// Turn a call to a function created by init_trampoline / adjust_trampoline
/// intrinsic pair into a direct call to the underlying function.
Instruction *
InstCombiner::transformCallThroughTrampoline(CallBase &Call,
                                             IntrinsicInst &Tramp) {
  Value *Callee = Call.getCalledValue();
  Type *CalleeTy = Callee->getType();
  FunctionType *FTy = Call.getFunctionType();
  AttributeList Attrs = Call.getAttributes();

  // If the call already has the 'nest' attribute somewhere then give up -
  // otherwise 'nest' would occur twice after splicing in the chain.
  if (Attrs.hasAttrSomewhere(Attribute::Nest))
    return nullptr;

  Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
  FunctionType *NestFTy = NestF->getFunctionType();

  AttributeList NestAttrs = NestF->getAttributes();
  if (!NestAttrs.isEmpty()) {
    unsigned NestArgNo = 0;
    Type *NestTy = nullptr;
    AttributeSet NestAttr;

    // Look for a parameter marked with the 'nest' attribute.
    for (FunctionType::param_iterator I = NestFTy->param_begin(),
                                      E = NestFTy->param_end();
         I != E; ++NestArgNo, ++I) {
      AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
      if (AS.hasAttribute(Attribute::Nest)) {
        // Record the parameter type and any other attributes.
        NestTy = *I;
        NestAttr = AS;
        break;
      }
    }

    if (NestTy) {
      std::vector<Value*> NewArgs;
      std::vector<AttributeSet> NewArgAttrs;
      NewArgs.reserve(Call.arg_size() + 1);
      NewArgAttrs.reserve(Call.arg_size());

      // Insert the nest argument into the call argument list, which may
      // mean appending it.  Likewise for attributes.

      {
        unsigned ArgNo = 0;
        auto I = Call.arg_begin(), E = Call.arg_end();
        do {
          if (ArgNo == NestArgNo) {
            // Add the chain argument and attributes.
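            // (Tramp is the llvm.init.trampoline call: getArgOperand(1) is the
            // underlying function NestF and getArgOperand(2) is the chain
            // value forwarded through its 'nest' parameter.)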
            Value *NestVal = Tramp.getArgOperand(2);
            if (NestVal->getType() != NestTy)
              NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
            NewArgs.push_back(NestVal);
            NewArgAttrs.push_back(NestAttr);
          }

          if (I == E)
            break;

          // Add the original argument and attributes.
          NewArgs.push_back(*I);
          NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));

          ++ArgNo;
          ++I;
        } while (true);
      }

      // The trampoline may have been bitcast to a bogus type (FTy).
      // Handle this by synthesizing a new function type, equal to FTy
      // with the chain parameter inserted.

      std::vector<Type*> NewTypes;
      NewTypes.reserve(FTy->getNumParams()+1);

      // Insert the chain's type into the list of parameter types, which may
      // mean appending it.
      {
        unsigned ArgNo = 0;
        FunctionType::param_iterator I = FTy->param_begin(),
                                     E = FTy->param_end();

        do {
          if (ArgNo == NestArgNo)
            // Add the chain's type.
            NewTypes.push_back(NestTy);

          if (I == E)
            break;

          // Add the original type.
          NewTypes.push_back(*I);

          ++ArgNo;
          ++I;
        } while (true);
      }

      // Replace the trampoline call with a direct call.  Let the generic
      // code sort out any function type mismatches.
      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
                                               FTy->isVarArg());
      Constant *NewCallee =
          NestF->getType() == PointerType::getUnqual(NewFTy)
              ? NestF
              : ConstantExpr::getBitCast(NestF,
                                         PointerType::getUnqual(NewFTy));
      AttributeList NewPAL =
          AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
                             Attrs.getRetAttributes(), NewArgAttrs);

      SmallVector<OperandBundleDef, 1> OpBundles;
      Call.getOperandBundlesAsDefs(OpBundles);

      Instruction *NewCaller;
      if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
        NewCaller = InvokeInst::Create(NewFTy, NewCallee,
                                       II->getNormalDest(), II->getUnwindDest(),
                                       NewArgs, OpBundles);
        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
        NewCaller =
            CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
                               CBI->getIndirectDests(), NewArgs, OpBundles);
        cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
        cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
      } else {
        NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
        cast<CallInst>(NewCaller)->setTailCallKind(
            cast<CallInst>(Call).getTailCallKind());
        cast<CallInst>(NewCaller)->setCallingConv(
            cast<CallInst>(Call).getCallingConv());
        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
      }
      NewCaller->setDebugLoc(Call.getDebugLoc());

      return NewCaller;
    }
  }

  // Replace the trampoline call with a direct call.  Since there is no 'nest'
  // parameter, there is no need to adjust the argument list.  Let the generic
  // code sort out any function type mismatches.
  Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
  Call.setCalledFunction(FTy, NewCallee);
  return &Call;
}
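
// Illustrative sketch of the trampoline transform above (hypothetical IR;
// names such as @f, %tramp and %env are made up for the example). Given:
//
//   define i32 @f(i8* nest %env, i32 %x) { ... }
//   call void @llvm.init.trampoline(i8* %tramp,
//       i8* bitcast (i32 (i8*, i32)* @f to i8*), i8* %env)
//   %p  = call i8* @llvm.adjust.trampoline(i8* %tramp)
//   %fp = bitcast i8* %p to i32 (i32)*
//   %r  = call i32 %fp(i32 %x)
//
// transformCallThroughTrampoline rewrites the final call into a direct call
// with the chain value spliced in as the 'nest' argument:
//
//   %r = call i32 @f(i8* nest %env, i32 %x)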