1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements the AMDGPU-specific InstCombine hooks exposed through
11 // TargetTransformInfo. It uses detailed knowledge of the target's intrinsics
12 // to fold and simplify their calls, while letting the target-independent
13 // InstCombine logic handle everything else.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 #include <optional>
24 
25 using namespace llvm;
26 using namespace llvm::PatternMatch;
27 
28 #define DEBUG_TYPE "AMDGPUtti"
29 
30 namespace {
31 
32 struct AMDGPUImageDMaskIntrinsic {
33   unsigned Intr;
34 };
35 
36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37 #include "InstCombineTables.inc"
38 
39 } // end anonymous namespace
40 
41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42 //
43 // A single NaN input is folded to minnum, so we rely on that folding for
44 // handling NaNs.
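//
// For illustration: fmed3(0.0, 2.0, 1.0) computes Max3 = 2.0, which matches
// Src1, so the result is maxnum(0.0, 1.0) = 1.0 -- the median of the inputs.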
45 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46                            const APFloat &Src2) {
47   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
48 
49   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
50   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51   if (Cmp0 == APFloat::cmpEqual)
52     return maxnum(Src1, Src2);
53 
54   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
55   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56   if (Cmp1 == APFloat::cmpEqual)
57     return maxnum(Src0, Src2);
58 
59   return maxnum(Src0, Src1);
60 }
61 
62 // Check if a value can be converted to a 16-bit value without losing
63 // precision.
64 // The value is expected to be either a float (IsFloat = true) or an unsigned
65 // integer (IsFloat = false).
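//
// Only constants and values extended from 16-bit types are recognized here:
// e.g. a ConstantFP 2.0 converts to half exactly and is safe, while 100000.0
// is out of half range and is not; a zext from i16 (or fpext from half) is
// always safe to narrow back.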
66 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67   Type *VTy = V.getType();
68   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
69     // The value is already 16-bit, so we don't want to convert to 16-bit again!
70     return false;
71   }
72   if (IsFloat) {
73     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
74       // We need to check that casting the value down to half does not lose
75       // precision.
76       APFloat FloatValue(ConstFloat->getValueAPF());
77       bool LosesInfo = true;
78       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
79                          &LosesInfo);
80       return !LosesInfo;
81     }
82   } else {
83     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
84       // We need to check that casting the value down to i16 does not lose
85       // precision.
86       APInt IntValue(ConstInt->getValue());
87       return IntValue.getActiveBits() <= 16;
88     }
89   }
90 
91   Value *CastSrc;
92   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
93                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
94   if (IsExt) {
95     Type *CastSrcTy = CastSrc->getType();
96     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
97       return true;
98   }
99 
100   return false;
101 }
102 
103 // Convert a value to 16-bit.
104 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105   Type *VTy = V.getType();
106   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
107     return cast<Instruction>(&V)->getOperand(0);
108   if (VTy->isIntegerTy())
109     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
110   if (VTy->isFloatingPointTy())
111     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
112 
113   llvm_unreachable("Should never be called!");
114 }
115 
116 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117 /// modified arguments (based on OldIntr) and replaces InstToReplace with
118 /// this newly created intrinsic call.
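///
/// For example, the _L to _LZ rewrite below passes a callback that simply
/// erases the lod operand from Args before the replacement call is created.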
119 static std::optional<Instruction *> modifyIntrinsicCall(
120     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121     InstCombiner &IC,
122     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123         Func) {
124   SmallVector<Type *, 4> ArgTys;
125   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
126     return std::nullopt;
127 
128   SmallVector<Value *, 8> Args(OldIntr.args());
129 
130   // Modify arguments and types
131   Func(Args, ArgTys);
132 
133   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
134 
135   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
136   NewCall->takeName(&OldIntr);
137   NewCall->copyMetadata(OldIntr);
138   if (isa<FPMathOperator>(NewCall))
139     NewCall->copyFastMathFlags(&OldIntr);
140 
141   // Erase and replace uses
142   if (!InstToReplace.getType()->isVoidTy())
143     IC.replaceInstUsesWith(InstToReplace, NewCall);
144 
145   bool RemoveOldIntr = &OldIntr != &InstToReplace;
146 
147   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
148   if (RemoveOldIntr)
149     IC.eraseInstFromFunction(OldIntr);
150 
151   return RetValue;
152 }
153 
154 static std::optional<Instruction *>
155 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157                              IntrinsicInst &II, InstCombiner &IC) {
158   // Optimize _L to _LZ when _L is zero
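  // e.g. (illustrative, type-mangling suffixes elided):
  //   call @llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
  // becomes
  //   call @llvm.amdgcn.image.sample.lz.2d(...)  ; lod operand dropped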
159   if (const auto *LZMappingInfo =
160           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
161     if (auto *ConstantLod =
162             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
163       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
166                                                      ImageDimIntr->Dim);
167         return modifyIntrinsicCall(
168             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
169               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170             });
171       }
172     }
173   }
174 
175   // Optimize _mip away, when 'lod' is zero
176   if (const auto *MIPMappingInfo =
177           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
178     if (auto *ConstantMip =
179             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
180       if (ConstantMip->isZero()) {
181         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
183                                                      ImageDimIntr->Dim);
184         return modifyIntrinsicCall(
185             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
186               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187             });
188       }
189     }
190   }
191 
192   // Optimize _bias away when 'bias' is zero
193   if (const auto *BiasMappingInfo =
194           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
195     if (auto *ConstantBias =
196             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
197       if (ConstantBias->isZero()) {
198         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
200                                                      ImageDimIntr->Dim);
201         return modifyIntrinsicCall(
202             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
203               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205             });
206       }
207     }
208   }
209 
210   // Optimize _offset away when 'offset' is zero
211   if (const auto *OffsetMappingInfo =
212           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
213     if (auto *ConstantOffset =
214             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
215       if (ConstantOffset->isZero()) {
216         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217             AMDGPU::getImageDimIntrinsicByBaseOpcode(
218                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
219         return modifyIntrinsicCall(
220             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
221               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222             });
223       }
224     }
225   }
226 
227   // Try to use D16
228   if (ST->hasD16Images()) {
229 
230     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
232 
233     if (BaseOpcode->HasD16) {
234 
235       // If the only use of image intrinsic is a fptrunc (with conversion to
236       // If the only use of the image intrinsic is an fptrunc to half, then
237       // both the fptrunc and the image intrinsic are replaced by the image
238       // intrinsic with the D16 flag.
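      // e.g. (illustrative, type-mangling suffixes elided):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes a single
      //   %h = call <4 x half> @llvm.amdgcn.image.sample.2d(...)  ; D16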
239         Instruction *User = II.user_back();
240 
241         if (User->getOpcode() == Instruction::FPTrunc &&
242             User->getType()->getScalarType()->isHalfTy()) {
243 
244           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
245                                      [&](auto &Args, auto &ArgTys) {
246                                        // Change return type of image intrinsic.
247                                        // Set it to return type of fptrunc.
248                                        ArgTys[0] = User->getType();
249                                      });
250         }
251       }
252     }
253   }
254 
255   // Try to use A16 or G16
256   if (!ST->hasA16() && !ST->hasG16())
257     return std::nullopt;
258 
259   // Address is interpreted as float if the instruction has a sampler or as
260   // unsigned int if there is no sampler.
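  // Illustrative sketch: if every coordinate (and gradient) is an fpext from
  // half, the fpexts are stripped and the coordinate overload type becomes
  // half (A16); if only the gradients qualify and the target has G16, just
  // the gradient type is narrowed.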
261   bool HasSampler =
262       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
263   bool FloatCoord = false;
264   // true means derivatives can be converted to 16 bit, coordinates not
265   bool OnlyDerivatives = false;
266 
267   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269     Value *Coord = II.getOperand(OperandIndex);
270     // If the values are not derived from 16-bit values, we cannot optimize.
271     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
272       if (OperandIndex < ImageDimIntr->CoordStart ||
273           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274         return std::nullopt;
275       }
276       // All gradients can be converted, so convert only them
277       OnlyDerivatives = true;
278       break;
279     }
280 
281     assert(OperandIndex == ImageDimIntr->GradientStart ||
282            FloatCoord == Coord->getType()->isFloatingPointTy());
283     FloatCoord = Coord->getType()->isFloatingPointTy();
284   }
285 
286   if (!OnlyDerivatives && !ST->hasA16())
287     OnlyDerivatives = true; // Only supports G16
288 
289   // Check if there is a bias parameter and if it can be converted to f16
290   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
292     assert(HasSampler &&
293            "Only image instructions with a sampler can have a bias");
294     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
295       OnlyDerivatives = true;
296   }
297 
298   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299                                                ImageDimIntr->CoordStart))
300     return std::nullopt;
301 
302   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
303                                : Type::getInt16Ty(II.getContext());
304 
305   return modifyIntrinsicCall(
306       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
307         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308         if (!OnlyDerivatives) {
309           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310 
311           // Change the bias type
312           if (ImageDimIntr->NumBiasArgs != 0)
313             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
314         }
315 
316         unsigned EndIndex =
317             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319              OperandIndex < EndIndex; OperandIndex++) {
320           Args[OperandIndex] =
321               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
322         }
323 
324         // Convert the bias
325         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
327           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
328         }
329       });
330 }
331 
332 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333                                            const Value *Op0, const Value *Op1,
334                                            InstCombiner &IC) const {
335   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336   // infinity, gives +0.0. If we can prove we don't have one of the special
337   // cases then we can use a normal multiply instead.
338   // TODO: Create and use isKnownFiniteNonZero instead of just matching
339   // constants here.
340   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
341       match(Op1, PatternMatch::m_FiniteNonZero())) {
342     // One operand is not zero or infinity or NaN.
343     return true;
344   }
345 
346   SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
347   if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
348       isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
349     // Neither operand is infinity or NaN.
350     return true;
351   }
352   return false;
353 }
354 
355 /// Match an fpext from half to float, or a constant we can convert.
356 static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
357   if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
358     return FPExtSrc->getType()->isHalfTy();
359 
360   ConstantFP *CFP;
361   if (match(Arg, m_ConstantFP(CFP))) {
362     bool LosesInfo;
363     APFloat Val(CFP->getValueAPF());
364     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
365     if (LosesInfo)
366       return false;
367 
368     FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
369     return true;
370   }
371 
372   return false;
373 }
374 
375 // Trim all zero components from the end of the vector \p UseV and return
376 // an appropriate bitset with known elements.
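//
// For example, <4 x float> <%x, %y, 0.0, 0.0> yields DemandedElts = 0b0011;
// element 0 is always kept.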
377 static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
378                                        Instruction *I) {
379   auto *VTy = cast<FixedVectorType>(UseV->getType());
380   unsigned VWidth = VTy->getNumElements();
381   APInt DemandedElts = APInt::getAllOnes(VWidth);
382 
383   for (int i = VWidth - 1; i > 0; --i) {
384     auto *Elt = findScalarElement(UseV, i);
385     if (!Elt)
386       break;
387 
388     if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
389       if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
390         break;
391     } else {
392       break;
393     }
394 
395     DemandedElts.clearBit(i);
396   }
397 
398   return DemandedElts;
399 }
400 
401 // Trim elements of the end of the vector \p V, if they are
402 // equal to the first element of the vector.
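//
// For example, a splat <4 x float> <%x, %x, %x, %x> yields DemandedElts =
// 0b0001, since the remaining components can be re-broadcast from the first.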
403 static APInt defaultComponentBroadcast(Value *V) {
404   auto *VTy = cast<FixedVectorType>(V->getType());
405   unsigned VWidth = VTy->getNumElements();
406   APInt DemandedElts = APInt::getAllOnes(VWidth);
407   Value *FirstComponent = findScalarElement(V, 0);
408 
409   SmallVector<int> ShuffleMask;
410   if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
411     SVI->getShuffleMask(ShuffleMask);
412 
413   for (int I = VWidth - 1; I > 0; --I) {
414     if (ShuffleMask.empty()) {
415       auto *Elt = findScalarElement(V, I);
416       if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
417         break;
418     } else {
419       // Detect identical elements in the shufflevector result, even though
420       // findScalarElement cannot tell us what that element is.
421       if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
422         break;
423     }
424     DemandedElts.clearBit(I);
425   }
426 
427   return DemandedElts;
428 }
429 
430 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
431                                                     IntrinsicInst &II,
432                                                     APInt DemandedElts,
433                                                     int DMaskIdx = -1,
434                                                     bool IsLoad = true);
435 
436 /// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
437 static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
438   return (SqrtOp->getType()->isFloatTy() &&
439           (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
440          SqrtOp->getType()->isHalfTy();
441 }
442 
443 std::optional<Instruction *>
444 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
445   Intrinsic::ID IID = II.getIntrinsicID();
446   switch (IID) {
447   case Intrinsic::amdgcn_rcp: {
448     Value *Src = II.getArgOperand(0);
449 
450     // TODO: Move to ConstantFolding/InstSimplify?
451     if (isa<UndefValue>(Src)) {
452       Type *Ty = II.getType();
453       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
454       return IC.replaceInstUsesWith(II, QNaN);
455     }
456 
457     if (II.isStrictFP())
458       break;
459 
460     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
461       const APFloat &ArgVal = C->getValueAPF();
462       APFloat Val(ArgVal.getSemantics(), 1);
463       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
464 
465       // This is more precise than the instruction may give.
466       //
467       // TODO: The instruction always flushes denormal results (except for f16),
468       // should this also?
469       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
470     }
471 
472     FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
473     if (!FMF.allowContract())
474       break;
475     auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
476     if (!SrcCI)
477       break;
478 
479     auto IID = SrcCI->getIntrinsicID();
480     // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
481     //
482     // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
483     // relaxed.
484     if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
485       const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
486       FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
487       if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
488         break;
489 
490       if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
491         break;
492 
493       Function *NewDecl = Intrinsic::getDeclaration(
494           SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
495 
496       InnerFMF |= FMF;
497       II.setFastMathFlags(InnerFMF);
498 
499       II.setCalledFunction(NewDecl);
500       return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
501     }
502 
503     break;
504   }
505   case Intrinsic::amdgcn_sqrt:
506   case Intrinsic::amdgcn_rsq: {
507     Value *Src = II.getArgOperand(0);
508 
509     // TODO: Move to ConstantFolding/InstSimplify?
510     if (isa<UndefValue>(Src)) {
511       Type *Ty = II.getType();
512       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
513       return IC.replaceInstUsesWith(II, QNaN);
514     }
515 
516     // f16 amdgcn.sqrt is identical to regular sqrt.
517     if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
518       Function *NewDecl = Intrinsic::getDeclaration(
519           II.getModule(), Intrinsic::sqrt, {II.getType()});
520       II.setCalledFunction(NewDecl);
521       return &II;
522     }
523 
524     break;
525   }
526   case Intrinsic::amdgcn_log:
527   case Intrinsic::amdgcn_exp2: {
528     const bool IsLog = IID == Intrinsic::amdgcn_log;
529     const bool IsExp = IID == Intrinsic::amdgcn_exp2;
530     Value *Src = II.getArgOperand(0);
531     Type *Ty = II.getType();
532 
533     if (isa<PoisonValue>(Src))
534       return IC.replaceInstUsesWith(II, Src);
535 
536     if (IC.getSimplifyQuery().isUndefValue(Src))
537       return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
538 
539     if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
540       if (C->isInfinity()) {
541         // exp2(+inf) -> +inf
542         // log2(+inf) -> +inf
543         if (!C->isNegative())
544           return IC.replaceInstUsesWith(II, C);
545 
546         // exp2(-inf) -> 0
547         if (IsExp && C->isNegative())
548           return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
549       }
550 
551       if (II.isStrictFP())
552         break;
553 
554       if (C->isNaN()) {
555         Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
556         return IC.replaceInstUsesWith(II, Quieted);
557       }
558 
559       // f32 instruction doesn't handle denormals, f16 does.
560       if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
561         Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
562                                       : ConstantFP::get(Ty, 1.0);
563         return IC.replaceInstUsesWith(II, FoldedValue);
564       }
565 
566       if (IsLog && C->isNegative())
567         return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
568 
569       // TODO: Full constant folding matching hardware behavior.
570     }
571 
572     break;
573   }
574   case Intrinsic::amdgcn_frexp_mant:
575   case Intrinsic::amdgcn_frexp_exp: {
576     Value *Src = II.getArgOperand(0);
577     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
578       int Exp;
579       APFloat Significand =
580           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
581 
582       if (IID == Intrinsic::amdgcn_frexp_mant) {
583         return IC.replaceInstUsesWith(
584             II, ConstantFP::get(II.getContext(), Significand));
585       }
586 
587       // Match instruction special case behavior.
588       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
589         Exp = 0;
590 
591       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
592     }
593 
594     if (isa<UndefValue>(Src)) {
595       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
596     }
597 
598     break;
599   }
600   case Intrinsic::amdgcn_class: {
601     Value *Src0 = II.getArgOperand(0);
602     Value *Src1 = II.getArgOperand(1);
603     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
604     if (CMask) {
605       II.setCalledOperand(Intrinsic::getDeclaration(
606           II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
607 
608       // Clamp any excess bits, as they're illegal for the generic intrinsic.
609       II.setArgOperand(1, ConstantInt::get(Src1->getType(),
610                                            CMask->getZExtValue() & fcAllFlags));
611       return &II;
612     }
613 
614     // Propagate poison.
615     if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
616       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
617 
618     // llvm.amdgcn.class(_, undef) -> false
619     if (IC.getSimplifyQuery().isUndefValue(Src1))
620       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
621 
622     // llvm.amdgcn.class(undef, mask) -> mask != 0
623     if (IC.getSimplifyQuery().isUndefValue(Src0)) {
624       Value *CmpMask = IC.Builder.CreateICmpNE(
625           Src1, ConstantInt::getNullValue(Src1->getType()));
626       return IC.replaceInstUsesWith(II, CmpMask);
627     }
628     break;
629   }
630   case Intrinsic::amdgcn_cvt_pkrtz: {
631     Value *Src0 = II.getArgOperand(0);
632     Value *Src1 = II.getArgOperand(1);
633     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
634       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
635         const fltSemantics &HalfSem =
636             II.getType()->getScalarType()->getFltSemantics();
637         bool LosesInfo;
638         APFloat Val0 = C0->getValueAPF();
639         APFloat Val1 = C1->getValueAPF();
640         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
641         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
642 
643         Constant *Folded =
644             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
645                                  ConstantFP::get(II.getContext(), Val1)});
646         return IC.replaceInstUsesWith(II, Folded);
647       }
648     }
649 
650     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
651       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
652     }
653 
654     break;
655   }
656   case Intrinsic::amdgcn_cvt_pknorm_i16:
657   case Intrinsic::amdgcn_cvt_pknorm_u16:
658   case Intrinsic::amdgcn_cvt_pk_i16:
659   case Intrinsic::amdgcn_cvt_pk_u16: {
660     Value *Src0 = II.getArgOperand(0);
661     Value *Src1 = II.getArgOperand(1);
662 
663     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
664       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
665     }
666 
667     break;
668   }
669   case Intrinsic::amdgcn_ubfe:
670   case Intrinsic::amdgcn_sbfe: {
671     // Decompose simple cases into standard shifts.
672     Value *Src = II.getArgOperand(0);
673     if (isa<UndefValue>(Src)) {
674       return IC.replaceInstUsesWith(II, Src);
675     }
676 
677     unsigned Width;
678     Type *Ty = II.getType();
679     unsigned IntSize = Ty->getIntegerBitWidth();
680 
681     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
682     if (CWidth) {
683       Width = CWidth->getZExtValue();
684       if ((Width & (IntSize - 1)) == 0) {
685         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
686       }
687 
688       // Hardware ignores high bits, so remove those.
689       if (Width >= IntSize) {
690         return IC.replaceOperand(
691             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
692       }
693     }
694 
695     unsigned Offset;
696     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
697     if (COffset) {
698       Offset = COffset->getZExtValue();
699       if (Offset >= IntSize) {
700         return IC.replaceOperand(
701             II, 1,
702             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
703       }
704     }
705 
706     bool Signed = IID == Intrinsic::amdgcn_sbfe;
707 
708     if (!CWidth || !COffset)
709       break;
710 
711     // The case of Width == 0 is handled above, which makes this transformation
712     // safe.  If Width == 0, then the ashr and lshr instructions would produce
713     // poison since the shift amount would be equal to the bit size.
714     assert(Width != 0);
715 
716     // TODO: This allows folding to undef when the hardware has specific
717     // behavior?
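    // e.g. for i32, ubfe(%x, 8, 8) becomes lshr(shl(%x, 16), 24), which
    // extracts bits [8..15].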
718     if (Offset + Width < IntSize) {
719       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
720       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
721                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
722       RightShift->takeName(&II);
723       return IC.replaceInstUsesWith(II, RightShift);
724     }
725 
726     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
727                                : IC.Builder.CreateLShr(Src, Offset);
728 
729     RightShift->takeName(&II);
730     return IC.replaceInstUsesWith(II, RightShift);
731   }
732   case Intrinsic::amdgcn_exp:
733   case Intrinsic::amdgcn_exp_row:
734   case Intrinsic::amdgcn_exp_compr: {
735     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
736     unsigned EnBits = En->getZExtValue();
737     if (EnBits == 0xf)
738       break; // All inputs enabled.
739 
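    // e.g. with en = 0b0011 on a non-compressed export, only src0 and src1
    // are read, so src2 and src3 below are replaced with undef.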
740     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
741     bool Changed = false;
742     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
743       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
744           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
745         Value *Src = II.getArgOperand(I + 2);
746         if (!isa<UndefValue>(Src)) {
747           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
748           Changed = true;
749         }
750       }
751     }
752 
753     if (Changed) {
754       return &II;
755     }
756 
757     break;
758   }
759   case Intrinsic::amdgcn_fmed3: {
760     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
761     // for the shader.
762 
763     Value *Src0 = II.getArgOperand(0);
764     Value *Src1 = II.getArgOperand(1);
765     Value *Src2 = II.getArgOperand(2);
766 
767     // Checking for NaN before canonicalization provides better fidelity when
768     // mapping other operations onto fmed3 since the order of operands is
769     // unchanged.
770     Value *V = nullptr;
771     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
772       V = IC.Builder.CreateMinNum(Src1, Src2);
773     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
774       V = IC.Builder.CreateMinNum(Src0, Src2);
775     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
776       V = IC.Builder.CreateMaxNum(Src0, Src1);
777     }
778 
779     if (V) {
780       if (auto *CI = dyn_cast<CallInst>(V)) {
781         CI->copyFastMathFlags(&II);
782         CI->takeName(&II);
783       }
784       return IC.replaceInstUsesWith(II, V);
785     }
786 
787     bool Swap = false;
788     // Canonicalize constants to RHS operands.
789     //
790     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
791     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
792       std::swap(Src0, Src1);
793       Swap = true;
794     }
795 
796     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
797       std::swap(Src1, Src2);
798       Swap = true;
799     }
800 
801     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
802       std::swap(Src0, Src1);
803       Swap = true;
804     }
805 
806     if (Swap) {
807       II.setArgOperand(0, Src0);
808       II.setArgOperand(1, Src1);
809       II.setArgOperand(2, Src2);
810       return &II;
811     }
812 
813     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
814       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
815         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
816           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
817                                        C2->getValueAPF());
818           return IC.replaceInstUsesWith(
819               II, ConstantFP::get(IC.Builder.getContext(), Result));
820         }
821       }
822     }
823 
824     if (!ST->hasMed3_16())
825       break;
826 
827     Value *X, *Y, *Z;
828 
829     // Repeat floating-point width reduction done for minnum/maxnum.
830     // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
831     if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
832         matchFPExtFromF16(Src2, Z)) {
833       Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
834                                                   {X, Y, Z}, &II, II.getName());
835       return new FPExtInst(NewCall, II.getType());
836     }
837 
838     break;
839   }
840   case Intrinsic::amdgcn_icmp:
841   case Intrinsic::amdgcn_fcmp: {
842     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
843     // Guard against invalid arguments.
844     int64_t CCVal = CC->getZExtValue();
845     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
846     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
847                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
848         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
849                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
850       break;
851 
852     Value *Src0 = II.getArgOperand(0);
853     Value *Src1 = II.getArgOperand(1);
854 
855     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
856       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
857         Constant *CCmp = ConstantFoldCompareInstOperands(
858             (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
859         if (CCmp && CCmp->isNullValue()) {
860           return IC.replaceInstUsesWith(
861               II, IC.Builder.CreateSExt(CCmp, II.getType()));
862         }
863 
864         // The result of V_ICMP/V_FCMP assembly instructions (which this
865         // intrinsic exposes) is one bit per thread, masked with the EXEC
866         // register (which contains the bitmask of live threads). So a
867         // comparison that always returns true is the same as a read of the
868         // EXEC register.
869         Function *NewF = Intrinsic::getDeclaration(
870             II.getModule(), Intrinsic::read_register, II.getType());
871         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
872         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
873         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
874         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
875         NewCall->addFnAttr(Attribute::Convergent);
876         NewCall->takeName(&II);
877         return IC.replaceInstUsesWith(II, NewCall);
878       }
879 
880       // Canonicalize constants to RHS.
881       CmpInst::Predicate SwapPred =
882           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
883       II.setArgOperand(0, Src1);
884       II.setArgOperand(1, Src0);
885       II.setArgOperand(
886           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
887       return &II;
888     }
889 
890     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
891       break;
892 
893     // Canonicalize compare eq with true value to compare != 0
894     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
895     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
896     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
897     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
898     Value *ExtSrc;
899     if (CCVal == CmpInst::ICMP_EQ &&
900         ((match(Src1, PatternMatch::m_One()) &&
901           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
902          (match(Src1, PatternMatch::m_AllOnes()) &&
903           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
904         ExtSrc->getType()->isIntegerTy(1)) {
905       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
906       IC.replaceOperand(II, 2,
907                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
908       return &II;
909     }
910 
911     CmpInst::Predicate SrcPred;
912     Value *SrcLHS;
913     Value *SrcRHS;
914 
915     // Fold compare eq/ne with 0 from a compare result as the predicate to the
916     // intrinsic. The typical use is a wave vote function in the library, which
917     // will be fed from a user code condition compared with 0. Fold in the
918     // redundant compare.
919 
920     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
921     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
922     //
923     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
924     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
925     if (match(Src1, PatternMatch::m_Zero()) &&
926         match(Src0, PatternMatch::m_ZExtOrSExt(
927                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
928                               PatternMatch::m_Value(SrcRHS))))) {
929       if (CCVal == CmpInst::ICMP_EQ)
930         SrcPred = CmpInst::getInversePredicate(SrcPred);
931 
932       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
933                                  ? Intrinsic::amdgcn_fcmp
934                                  : Intrinsic::amdgcn_icmp;
935 
936       Type *Ty = SrcLHS->getType();
937       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
938         // Promote to next legal integer type.
939         unsigned Width = CmpType->getBitWidth();
940         unsigned NewWidth = Width;
941 
942         // Don't do anything for i1 comparisons.
943         if (Width == 1)
944           break;
945 
946         if (Width <= 16)
947           NewWidth = 16;
948         else if (Width <= 32)
949           NewWidth = 32;
950         else if (Width <= 64)
951           NewWidth = 64;
952         else
953           break; // Can't handle this.
954 
955         if (Width != NewWidth) {
956           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
957           if (CmpInst::isSigned(SrcPred)) {
958             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
959             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
960           } else {
961             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
962             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
963           }
964         }
965       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
966         break;
967 
968       Function *NewF = Intrinsic::getDeclaration(
969           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
970       Value *Args[] = {SrcLHS, SrcRHS,
971                        ConstantInt::get(CC->getType(), SrcPred)};
972       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
973       NewCall->takeName(&II);
974       return IC.replaceInstUsesWith(II, NewCall);
975     }
976 
977     break;
978   }
979   case Intrinsic::amdgcn_mbcnt_hi: {
980     // exec_hi is all 0, so this is just a copy.
981     if (ST->isWave32())
982       return IC.replaceInstUsesWith(II, II.getArgOperand(1));
983     break;
984   }
985   case Intrinsic::amdgcn_ballot: {
986     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
987       if (Src->isZero()) {
988         // amdgcn.ballot(i1 0) is zero.
989         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
990       }
991     }
992     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
993       // %b64 = call i64 ballot.i64(...)
994       // =>
995       // %b32 = call i32 ballot.i32(...)
996       // %b64 = zext i32 %b32 to i64
997       Value *Call = IC.Builder.CreateZExt(
998           IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
999                                      {IC.Builder.getInt32Ty()},
1000                                      {II.getArgOperand(0)}),
1001           II.getType());
1002       Call->takeName(&II);
1003       return IC.replaceInstUsesWith(II, Call);
1004     }
1005     break;
1006   }
1007   case Intrinsic::amdgcn_wqm_vote: {
1008     // wqm_vote is identity when the argument is constant.
1009     if (!isa<Constant>(II.getArgOperand(0)))
1010       break;
1011 
1012     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1013   }
1014   case Intrinsic::amdgcn_kill: {
1015     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1016     if (!C || !C->getZExtValue())
1017       break;
1018 
1019     // amdgcn.kill(i1 1) is a no-op
1020     return IC.eraseInstFromFunction(II);
1021   }
1022   case Intrinsic::amdgcn_update_dpp: {
1023     Value *Old = II.getArgOperand(0);
1024 
1025     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1026     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1027     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1028     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1029         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1030       break;
1031 
1032     // If bound_ctrl = 1 and row_mask = bank_mask = 0xf, we can omit the old value.
1033     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1034   }
1035   case Intrinsic::amdgcn_permlane16:
1036   case Intrinsic::amdgcn_permlane16_var:
1037   case Intrinsic::amdgcn_permlanex16:
1038   case Intrinsic::amdgcn_permlanex16_var: {
1039     // Discard vdst_in if it's not going to be read.
1040     Value *VDstIn = II.getArgOperand(0);
1041     if (isa<UndefValue>(VDstIn))
1042       break;
1043 
1044     // FetchInvalid operand idx.
1045     unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1046                           IID == Intrinsic::amdgcn_permlanex16)
1047                              ? 4  /* for permlane16 and permlanex16 */
1048                              : 3; /* for permlane16_var and permlanex16_var */
1049 
1050     // BoundCtrl operand idx.
1051     // For permlane16 and permlanex16 it should be 5.
1052     // For permlane16_var and permlanex16_var it should be 4.
1053     unsigned int BcIdx = FiIdx + 1;
1054 
1055     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1056     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1057     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1058       break;
1059 
1060     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1061   }
1062   case Intrinsic::amdgcn_permlane64:
1063     // A constant value is trivially uniform.
1064     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1065       return IC.replaceInstUsesWith(II, C);
1066     }
1067     break;
1068   case Intrinsic::amdgcn_readfirstlane:
1069   case Intrinsic::amdgcn_readlane: {
1070     // A constant value is trivially uniform.
1071     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1072       return IC.replaceInstUsesWith(II, C);
1073     }
1074 
1075     // The rest of these may not be safe if the exec may not be the same between
1076     // the def and use.
1077     Value *Src = II.getArgOperand(0);
1078     Instruction *SrcInst = dyn_cast<Instruction>(Src);
1079     if (SrcInst && SrcInst->getParent() != II.getParent())
1080       break;
1081 
1082     // readfirstlane (readfirstlane x) -> readfirstlane x
1083     // readlane (readfirstlane x), y -> readfirstlane x
1084     if (match(Src,
1085               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1086       return IC.replaceInstUsesWith(II, Src);
1087     }
1088 
1089     if (IID == Intrinsic::amdgcn_readfirstlane) {
1090       // readfirstlane (readlane x, y) -> readlane x, y
1091       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1092         return IC.replaceInstUsesWith(II, Src);
1093       }
1094     } else {
1095       // readlane (readlane x, y), y -> readlane x, y
1096       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1097                          PatternMatch::m_Value(),
1098                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
1099         return IC.replaceInstUsesWith(II, Src);
1100       }
1101     }
1102 
1103     break;
1104   }
1105   case Intrinsic::amdgcn_trig_preop: {
1106     // The intrinsic is declared with name mangling, but currently the
1107     // instruction only exists for f64
1108     if (!II.getType()->isDoubleTy())
1109       break;
1110 
1111     Value *Src = II.getArgOperand(0);
1112     Value *Segment = II.getArgOperand(1);
1113     if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1114       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1115 
1116     if (isa<UndefValue>(Src)) {
1117       auto *QNaN = ConstantFP::get(
1118           II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1119       return IC.replaceInstUsesWith(II, QNaN);
1120     }
1121 
1122     const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1123     if (!Csrc)
1124       break;
1125 
1126     if (II.isStrictFP())
1127       break;
1128 
1129     const APFloat &Fsrc = Csrc->getValueAPF();
1130     if (Fsrc.isNaN()) {
1131       auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1132       return IC.replaceInstUsesWith(II, Quieted);
1133     }
1134 
1135     const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1136     if (!Cseg)
1137       break;
1138 
1139     unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1140     unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1141     unsigned Shift = SegmentVal * 53;
1142     if (Exponent > 1077)
1143       Shift += Exponent - 1077;
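    // The code below reads 53 bits of the fixed-point expansion of 2.0/PI
    // starting at bit position Shift. For example, segment 0 of a source with
    // a small exponent selects the top 53 bits, giving approximately
    // 0.63662 == 2/PI after the final scaling by 2^-53.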
1144 
1145     // 2.0/PI table.
1146     static const uint32_t TwoByPi[] = {
1147         0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1148         0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1149         0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1150         0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1151         0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1152         0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1153         0x56033046};
1154 
1155     // Return 0 for outbound segment (hardware behavior).
1156     unsigned Idx = Shift >> 5;
1157     if (Idx + 2 >= std::size(TwoByPi)) {
1158       APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1159       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1160     }
1161 
1162     unsigned BShift = Shift & 0x1f;
1163     uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1164     uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1165     if (BShift)
1166       Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1167     Thi = Thi >> 11;
1168     APFloat Result = APFloat((double)Thi);
1169 
1170     int Scale = -53 - Shift;
1171     if (Exponent >= 1968)
1172       Scale += 128;
1173 
1174     Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1175     return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1176   }
1177   case Intrinsic::amdgcn_fmul_legacy: {
1178     Value *Op0 = II.getArgOperand(0);
1179     Value *Op1 = II.getArgOperand(1);
1180 
1181     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1182     // infinity, gives +0.0.
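    // e.g. (illustrative):
    //   call float @llvm.amdgcn.fmul.legacy(float 0.0, float %x)
    // folds to +0.0 even when %x is NaN, where a plain fmul would give NaN.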
1183     // TODO: Move to InstSimplify?
1184     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1185         match(Op1, PatternMatch::m_AnyZeroFP()))
1186       return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1187 
1188     // If we can prove we don't have one of the special cases then we can use a
1189     // normal fmul instruction instead.
1190     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1191       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1192       FMul->takeName(&II);
1193       return IC.replaceInstUsesWith(II, FMul);
1194     }
1195     break;
1196   }
1197   case Intrinsic::amdgcn_fma_legacy: {
1198     Value *Op0 = II.getArgOperand(0);
1199     Value *Op1 = II.getArgOperand(1);
1200     Value *Op2 = II.getArgOperand(2);
1201 
1202     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1203     // infinity, gives +0.0.
1204     // TODO: Move to InstSimplify?
1205     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1206         match(Op1, PatternMatch::m_AnyZeroFP())) {
1207       // It's tempting to just return Op2 here, but that would give the wrong
1208       // result if Op2 was -0.0.
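      // ((+0.0) + (-0.0)) evaluates to +0.0 under the default rounding mode,
      // so the fadd below yields the correctly signed zero.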
1209       auto *Zero = ConstantFP::getZero(II.getType());
1210       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1211       FAdd->takeName(&II);
1212       return IC.replaceInstUsesWith(II, FAdd);
1213     }
1214 
1215     // If we can prove we don't have one of the special cases then we can use a
1216     // normal fma instead.
1217     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1218       II.setCalledOperand(Intrinsic::getDeclaration(
1219           II.getModule(), Intrinsic::fma, II.getType()));
1220       return &II;
1221     }
1222     break;
1223   }
1224   case Intrinsic::amdgcn_is_shared:
1225   case Intrinsic::amdgcn_is_private: {
1226     if (isa<UndefValue>(II.getArgOperand(0)))
1227       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1228 
1229     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1230       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1231     break;
1232   }
1233   case Intrinsic::amdgcn_raw_buffer_store_format:
1234   case Intrinsic::amdgcn_struct_buffer_store_format:
1235   case Intrinsic::amdgcn_raw_tbuffer_store:
1236   case Intrinsic::amdgcn_struct_tbuffer_store:
1237   case Intrinsic::amdgcn_image_store_1d:
1238   case Intrinsic::amdgcn_image_store_1darray:
1239   case Intrinsic::amdgcn_image_store_2d:
1240   case Intrinsic::amdgcn_image_store_2darray:
1241   case Intrinsic::amdgcn_image_store_2darraymsaa:
1242   case Intrinsic::amdgcn_image_store_2dmsaa:
1243   case Intrinsic::amdgcn_image_store_3d:
1244   case Intrinsic::amdgcn_image_store_cube:
1245   case Intrinsic::amdgcn_image_store_mip_1d:
1246   case Intrinsic::amdgcn_image_store_mip_1darray:
1247   case Intrinsic::amdgcn_image_store_mip_2d:
1248   case Intrinsic::amdgcn_image_store_mip_2darray:
1249   case Intrinsic::amdgcn_image_store_mip_3d:
1250   case Intrinsic::amdgcn_image_store_mip_cube: {
1251     if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1252       break;
1253 
1254     APInt DemandedElts;
1255     if (ST->hasDefaultComponentBroadcast())
1256       DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1257     else if (ST->hasDefaultComponentZero())
1258       DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1259     else
1260       break;
1261 
1262     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1263     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1264                                               false)) {
1265       return IC.eraseInstFromFunction(II);
1266     }
1267 
1268     break;
1269   }
1270   }
1271   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1272             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1273     return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1274   }
1275   return std::nullopt;
1276 }
1277 
1278 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1279 ///
1280 /// For image and buffer store intrinsics, simplification rewrites the vector
1281 /// data argument that feeds the intrinsic, whereas for image and buffer loads
1282 /// it is the uses of the intrinsic's result that are rewritten.
1283 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1284 ///       struct returns.
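///
/// Illustrative example (type mangling elided): a <4 x float> image load with
/// dmask 0xf whose only demanded element is element 0 is rewritten to a
/// scalar float load with dmask 0x1 followed by an insertelement into a
/// poison <4 x float>.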
1285 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1286                                                     IntrinsicInst &II,
1287                                                     APInt DemandedElts,
1288                                                     int DMaskIdx, bool IsLoad) {
1289 
1290   auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1291                                              : II.getOperand(0)->getType());
1292   unsigned VWidth = IIVTy->getNumElements();
1293   if (VWidth == 1)
1294     return nullptr;
1295   Type *EltTy = IIVTy->getElementType();
1296 
1297   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1298   IC.Builder.SetInsertPoint(&II);
1299 
1300   // Assume the arguments are unchanged and later override them, if needed.
1301   SmallVector<Value *, 16> Args(II.args());
1302 
1303   if (DMaskIdx < 0) {
1304     // Buffer case.
1305 
1306     const unsigned ActiveBits = DemandedElts.getActiveBits();
1307     const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1308 
1309     // Start assuming the prefix of elements is demanded, but possibly clear
1310     // some other bits if there are trailing zeros (unused components at front)
1311     // and update offset.
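    // For example, if only elements 2 and 3 of a <4 x float> raw buffer load
    // are demanded, the load is narrowed to <2 x float> and the byte offset
    // operand is increased by 2 * sizeof(float) == 8.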
1312     DemandedElts = (1 << ActiveBits) - 1;
1313 
1314     if (UnusedComponentsAtFront > 0) {
1315       static const unsigned InvalidOffsetIdx = 0xf;
1316 
1317       unsigned OffsetIdx;
1318       switch (II.getIntrinsicID()) {
1319       case Intrinsic::amdgcn_raw_buffer_load:
1320       case Intrinsic::amdgcn_raw_ptr_buffer_load:
1321         OffsetIdx = 1;
1322         break;
1323       case Intrinsic::amdgcn_s_buffer_load:
1324         // If resulting type is vec3, there is no point in trimming the
1325         // load with updated offset, as the vec3 would most likely be widened to
1326         // vec4 anyway during lowering.
1327         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1328           OffsetIdx = InvalidOffsetIdx;
1329         else
1330           OffsetIdx = 1;
1331         break;
1332       case Intrinsic::amdgcn_struct_buffer_load:
1333       case Intrinsic::amdgcn_struct_ptr_buffer_load:
1334         OffsetIdx = 2;
1335         break;
1336       default:
1337         // TODO: handle tbuffer* intrinsics.
1338         OffsetIdx = InvalidOffsetIdx;
1339         break;
1340       }
1341 
1342       if (OffsetIdx != InvalidOffsetIdx) {
1343         // Clear demanded bits and update the offset.
1344         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1345         auto *Offset = Args[OffsetIdx];
1346         unsigned SingleComponentSizeInBits =
1347             IC.getDataLayout().getTypeSizeInBits(EltTy);
1348         unsigned OffsetAdd =
1349             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1350         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1351         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1352       }
1353     }
1354   } else {
1355     // Image case.
1356 
1357     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1358     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1359 
1360     // dmask 0 has special semantics, do not simplify.
1361     if (DMaskVal == 0)
1362       return nullptr;
1363 
1364     // Mask off values that are undefined because the dmask doesn't cover them
1365     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1366 
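    // For example, with dmask = 0b1011 (three components in memory order),
    // demanding only elements 0 and 2 of the value gives a new dmask of
    // 0b1001.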
1367     unsigned NewDMaskVal = 0;
1368     unsigned OrigLdStIdx = 0;
1369     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1370       const unsigned Bit = 1 << SrcIdx;
1371       if (!!(DMaskVal & Bit)) {
1372         if (!!DemandedElts[OrigLdStIdx])
1373           NewDMaskVal |= Bit;
1374         OrigLdStIdx++;
1375       }
1376     }
1377 
1378     if (DMaskVal != NewDMaskVal)
1379       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1380   }
1381 
1382   unsigned NewNumElts = DemandedElts.popcount();
1383   if (!NewNumElts)
1384     return PoisonValue::get(IIVTy);
1385 
1386   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1387     if (DMaskIdx >= 0)
1388       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1389     return nullptr;
1390   }
1391 
1392   // Validate function argument and return types, extracting overloaded types
1393   // along the way.
1394   SmallVector<Type *, 6> OverloadTys;
1395   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1396     return nullptr;
1397 
1398   Type *NewTy =
1399       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1400   OverloadTys[0] = NewTy;
1401 
1402   if (!IsLoad) {
1403     SmallVector<int, 8> EltMask;
1404     for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1405       if (DemandedElts[OrigStoreIdx])
1406         EltMask.push_back(OrigStoreIdx);
1407 
1408     if (NewNumElts == 1)
1409       Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1410     else
1411       Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1412   }
1413 
1414   Function *NewIntrin = Intrinsic::getDeclaration(
1415       II.getModule(), II.getIntrinsicID(), OverloadTys);
1416   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1417   NewCall->takeName(&II);
1418   NewCall->copyMetadata(II);
1419 
1420   if (IsLoad) {
1421     if (NewNumElts == 1) {
1422       return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1423                                             DemandedElts.countr_zero());
1424     }
1425 
1426     SmallVector<int, 8> EltMask;
1427     unsigned NewLoadIdx = 0;
1428     for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1429       if (!!DemandedElts[OrigLoadIdx])
1430         EltMask.push_back(NewLoadIdx++);
1431       else
1432         EltMask.push_back(NewNumElts);
1433     }
1434 
1435     auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1436 
1437     return Shuffle;
1438   }
1439 
1440   return NewCall;
1441 }
1442 
1443 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1444     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1445     APInt &UndefElts2, APInt &UndefElts3,
1446     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1447         SimplifyAndSetOp) const {
1448   switch (II.getIntrinsicID()) {
1449   case Intrinsic::amdgcn_raw_buffer_load:
1450   case Intrinsic::amdgcn_raw_ptr_buffer_load:
1451   case Intrinsic::amdgcn_raw_buffer_load_format:
1452   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1453   case Intrinsic::amdgcn_raw_tbuffer_load:
1454   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1455   case Intrinsic::amdgcn_s_buffer_load:
1456   case Intrinsic::amdgcn_struct_buffer_load:
1457   case Intrinsic::amdgcn_struct_ptr_buffer_load:
1458   case Intrinsic::amdgcn_struct_buffer_load_format:
1459   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1460   case Intrinsic::amdgcn_struct_tbuffer_load:
1461   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1462     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1463   default: {
1464     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1465       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1466     }
1467     break;
1468   }
1469   }
1470   return std::nullopt;
1471 }
1472