//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific InstCombine transformations,
// exposed through the TargetTransformInfo hooks. It uses the target's
// detailed information to fold and simplify calls to AMDGPU intrinsics,
// while letting the target-independent InstCombine transforms handle the
// rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the value down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the value down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
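// Extension instructions are simply peeled back to their 16-bit source;
// other values are truncated to i16 or half as appropriate.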
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of the image intrinsic is an fptrunc (with conversion
      // to half), then both the fptrunc and the image intrinsic are replaced
      // with an image intrinsic carrying the D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change the return type of the image
                                       // intrinsic to the type of the fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // If true, only the derivatives (gradients) can be converted to 16 bit, not
  // the coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
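    // A gradient that cannot be converted (or the lack of separate gradient
    // operands) rules out both A16 and G16; a coordinate that cannot be
    // converted still allows converting just the gradients (G16).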
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfOrNaN(Op0, IC.getDataLayout(), TLI, 0,
                           &IC.getAssumptionCache(), &I,
                           &IC.getDominatorTree()) &&
      isKnownNeverInfOrNaN(Op1, IC.getDataLayout(), TLI, 0,
                           &IC.getAssumptionCache(), &I,
                           &IC.getDominatorTree())) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

/// Match an fpext from half to float, or a constant we can convert.
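/// On success, \p FPExtSrc holds the half-typed source value, or an
/// equivalent half constant when \p Arg is a constant that converts exactly.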
static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
  if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
    return FPExtSrc->getType()->isHalfTy();

  ConstantFP *CFP;
  if (match(Arg, m_ConstantFP(CFP))) {
    bool LosesInfo;
    APFloat Val(CFP->getValueAPF());
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (LosesInfo)
      return false;

    FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
    return true;
  }

  return false;
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with the demanded elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
                                       Instruction *I) {
  auto *VTy = cast<FixedVectorType>(UseV->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);

  for (int i = VWidth - 1; i > 0; --i) {
    auto *Elt = findScalarElement(UseV, i);
    if (!Elt)
      break;

    if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
      if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
        break;
    } else {
      break;
    }

    DemandedElts.clearBit(i);
  }

  return DemandedElts;
}

// Trim elements from the end of the vector \p V if they are equal to the
// first element of the vector.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  Value *FirstComponent = findScalarElement(V, 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
    SVI->getShuffleMask(ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      auto *Elt = findScalarElement(V, I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(I);
  }

  return DemandedElts;
}

static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);

/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
  return (SqrtOp->getType()->isFloatTy() &&
          (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
         SqrtOp->getType()->isHalfTy();
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
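    // An undef input may take any value, so fold to a canonical quiet NaN.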
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
    if (!FMF.allowContract())
      break;
    auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
    if (!SrcCI)
      break;

    auto IID = SrcCI->getIntrinsicID();
    // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
    //
    // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
    // relaxed.
    if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
      const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
      FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
      if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
        break;

      if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
        break;

      Function *NewDecl = Intrinsic::getDeclaration(
          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});

      InnerFMF |= FMF;
      II.setFastMathFlags(InnerFMF);

      II.setCalledFunction(NewDecl);
      return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // f16 amdgcn.sqrt is identical to regular sqrt.
    if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
      Function *NewDecl = Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::sqrt, {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2: {
    const bool IsLog = IID == Intrinsic::amdgcn_log;
    const bool IsExp = IID == Intrinsic::amdgcn_exp2;
    Value *Src = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    if (IC.getSimplifyQuery().isUndefValue(Src))
      return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

    if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      if (C->isInfinity()) {
        // exp2(+inf) -> +inf
        // log2(+inf) -> +inf
        if (!C->isNegative())
          return IC.replaceInstUsesWith(II, C);

        // exp2(-inf) -> 0
        if (IsExp && C->isNegative())
          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
      }

      if (II.isStrictFP())
        break;

      if (C->isNaN()) {
        Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
        return IC.replaceInstUsesWith(II, Quieted);
      }

      // f32 instruction doesn't handle denormals, f16 does.
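      // Zero (and, for f32, denormal) inputs fold as if flushed to zero:
      // log2 -> -inf, exp2 -> 1.0.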
      if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
        Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
                                      : ConstantFP::get(Ty, 1.0);
        return IC.replaceInstUsesWith(II, FoldedValue);
      }

      if (IsLog && C->isNegative())
        return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

      // TODO: Full constant folding matching hardware behavior.
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (CMask) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::is_fpclass, Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(1, ConstantInt::get(Src1->getType(),
                                           CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(Src1))
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          Src1, ConstantInt::getNullValue(Src1->getType()));
      return IC.replaceInstUsesWith(II, CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would be poison
    // since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
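    // A single NaN (or undef) operand lets fmed3 collapse to a two-operand
    // min/max of the remaining operands.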
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    if (!ST->hasMed3_16())
      break;

    Value *X, *Y, *Z;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
        matchFPExtFromF16(Src2, Z)) {
      Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
                                                  {X, Y, Z}, &II, II.getName());
      return new FPExtInst(NewCall, II.getType());
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, IC.Builder.CreateSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.
        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_mbcnt_hi: {
    // exec_hi is all 0, so this is just a copy.
    if (ST->isWave32())
      return IC.replaceInstUsesWith(II, II.getArgOperand(1));
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4 /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5.
    // For permlane16_var and permlanex16_var it should be 4.
    unsigned int BcIdx = FiIdx + 1;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if the exec mask can differ between
    // the def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getZero(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  case Intrinsic::amdgcn_buffer_store_format:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
      break;

    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
    else
      break;

    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              false)) {
      return IC.eraseInstFromFunction(II);
    }

    break;
  }
  }
  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image
/// intrinsics.
///
/// For image and buffer store intrinsics, the simplification updates the
/// intrinsic's vector data operand; for loads it rewrites the uses of the
/// result instead.
/// Note: Only non-TFE/LWE image intrinsic calls are supported here; the
/// TFE/LWE variants have struct returns and are not handled.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
                                             : II.getOperand(0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.
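    // For buffers the demanded components form a contiguous prefix of the
    // result; trailing components are simply dropped, and unused leading
    // components can sometimes be skipped by folding their size into the
    // byte offset below.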
    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start by assuming a demanded prefix of ActiveBits elements, then clear
    // the bits for any unused components at the front and fold their size
    // into the offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If the resulting type is vec3, there is no point in trimming the
        // load with an updated offset, as the vec3 would most likely be
        // widened to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.popcount();
  if (!NewNumElts)
    return PoisonValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
  }

  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (IsLoad) {
    if (NewNumElts == 1) {
      return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
                                            DemandedElts.countr_zero());
    }

    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(NewLoadIdx++);
      else
        EltMask.push_back(NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

    return Shuffle;
  }

  return NewCall;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}