//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
// This file implements the AMDGPU-specific InstCombine hooks exposed through
// TargetTransformInfo. It uses the target's detailed information to fold and
// simplify AMDGPU intrinsics, while letting the target-independent InstCombine
// transforms handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 #include <optional>
24 
25 using namespace llvm;
26 using namespace llvm::PatternMatch;
27 
28 #define DEBUG_TYPE "AMDGPUtti"
29 
30 namespace {
31 
32 struct AMDGPUImageDMaskIntrinsic {
33   unsigned Intr;
34 };
35 
36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37 #include "InstCombineTables.inc"
38 
39 } // end anonymous namespace
40 
41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42 //
43 // A single NaN input is folded to minnum, so we rely on that folding for
44 // handling NaNs.
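//
// Worked example: fmed3(1.0, 5.0, 3.0) computes Max3 = 5.0, which matches
// Src1, so the result is maxnum(Src0, Src2) = 3.0, the median of the three
// finite inputs.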
45 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46                            const APFloat &Src2) {
47   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
48 
49   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
50   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51   if (Cmp0 == APFloat::cmpEqual)
52     return maxnum(Src1, Src2);
53 
54   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
55   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56   if (Cmp1 == APFloat::cmpEqual)
57     return maxnum(Src0, Src2);
58 
59   return maxnum(Src0, Src1);
60 }
61 
62 // Check if a value can be converted to a 16-bit value without losing
63 // precision.
64 // The value is expected to be either a float (IsFloat = true) or an unsigned
65 // integer (IsFloat = false).
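//
// Illustrative examples: a float constant 0.5 or an i32 constant 1000 can be
// narrowed losslessly, as can an fpext/zext whose source is already half/i16;
// an i32 constant such as 70000 (which needs 17 bits) cannot.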
66 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67   Type *VTy = V.getType();
68   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
69     // The value is already 16-bit, so we don't want to convert to 16-bit again!
70     return false;
71   }
72   if (IsFloat) {
73     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
74       // We need to check that if we cast the index down to a half, we do not
75       // lose precision.
76       APFloat FloatValue(ConstFloat->getValueAPF());
77       bool LosesInfo = true;
78       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
79                          &LosesInfo);
80       return !LosesInfo;
81     }
82   } else {
83     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
84       // We need to check that if we cast the index down to an i16, we do not
85       // lose precision.
86       APInt IntValue(ConstInt->getValue());
87       return IntValue.getActiveBits() <= 16;
88     }
89   }
90 
91   Value *CastSrc;
92   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
93                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
94   if (IsExt) {
95     Type *CastSrcTy = CastSrc->getType();
96     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
97       return true;
98   }
99 
100   return false;
101 }
102 
103 // Convert a value to 16-bit.
104 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105   Type *VTy = V.getType();
106   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
107     return cast<Instruction>(&V)->getOperand(0);
108   if (VTy->isIntegerTy())
109     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
110   if (VTy->isFloatingPointTy())
111     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
112 
113   llvm_unreachable("Should never be called!");
114 }
115 
116 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117 /// modified arguments (based on OldIntr) and replaces InstToReplace with
118 /// this newly created intrinsic call.
119 static std::optional<Instruction *> modifyIntrinsicCall(
120     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121     InstCombiner &IC,
122     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123         Func) {
124   SmallVector<Type *, 4> ArgTys;
125   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
126     return std::nullopt;
127 
128   SmallVector<Value *, 8> Args(OldIntr.args());
129 
130   // Modify arguments and types
131   Func(Args, ArgTys);
132 
133   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
134 
135   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
136   NewCall->takeName(&OldIntr);
137   NewCall->copyMetadata(OldIntr);
138   if (isa<FPMathOperator>(NewCall))
139     NewCall->copyFastMathFlags(&OldIntr);
140 
141   // Erase and replace uses
142   if (!InstToReplace.getType()->isVoidTy())
143     IC.replaceInstUsesWith(InstToReplace, NewCall);
144 
145   bool RemoveOldIntr = &OldIntr != &InstToReplace;
146 
147   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
148   if (RemoveOldIntr)
149     IC.eraseInstFromFunction(OldIntr);
150 
151   return RetValue;
152 }
153 
154 static std::optional<Instruction *>
155 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157                              IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when 'lod' is a constant that is zero or negative
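  //
  // For example (illustrative intrinsic names):
  //   llvm.amdgcn.image.sample.l.2d(..., lod 0.0, ...)
  //     -> llvm.amdgcn.image.sample.lz.2d(...) with the lod operand dropped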
159   if (const auto *LZMappingInfo =
160           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
161     if (auto *ConstantLod =
162             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
163       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
166                                                      ImageDimIntr->Dim);
167         return modifyIntrinsicCall(
168             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
169               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170             });
171       }
172     }
173   }
174 
  // Optimize _mip away when 'lod' is zero
176   if (const auto *MIPMappingInfo =
177           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
178     if (auto *ConstantMip =
179             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
180       if (ConstantMip->isZero()) {
181         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
183                                                      ImageDimIntr->Dim);
184         return modifyIntrinsicCall(
185             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
186               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187             });
188       }
189     }
190   }
191 
192   // Optimize _bias away when 'bias' is zero
193   if (const auto *BiasMappingInfo =
194           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
195     if (auto *ConstantBias =
196             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
197       if (ConstantBias->isZero()) {
198         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
200                                                      ImageDimIntr->Dim);
201         return modifyIntrinsicCall(
202             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
203               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205             });
206       }
207     }
208   }
209 
210   // Optimize _offset away when 'offset' is zero
211   if (const auto *OffsetMappingInfo =
212           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
213     if (auto *ConstantOffset =
214             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
215       if (ConstantOffset->isZero()) {
216         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217             AMDGPU::getImageDimIntrinsicByBaseOpcode(
218                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
219         return modifyIntrinsicCall(
220             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
221               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222             });
223       }
224     }
225   }
226 
227   // Try to use D16
228   if (ST->hasD16Images()) {
229 
230     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
232 
233     if (BaseOpcode->HasD16) {
234 
      // If the only use of the image intrinsic is an fptrunc (with conversion
      // to half), then both the fptrunc and the image intrinsic will be
      // replaced with an image intrinsic carrying the D16 flag.
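      //
      // For example (a sketch; types and intrinsic spelling are illustrative):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample...(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes a single call returning <4 x half> (the D16 form).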
238       if (II.hasOneUse()) {
239         Instruction *User = II.user_back();
240 
241         if (User->getOpcode() == Instruction::FPTrunc &&
242             User->getType()->getScalarType()->isHalfTy()) {
243 
244           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
245                                      [&](auto &Args, auto &ArgTys) {
246                                        // Change return type of image intrinsic.
247                                        // Set it to return type of fptrunc.
248                                        ArgTys[0] = User->getType();
249                                      });
250         }
251       }
252     }
253   }
254 
255   // Try to use A16 or G16
256   if (!ST->hasA16() && !ST->hasG16())
257     return std::nullopt;
258 
259   // Address is interpreted as float if the instruction has a sampler or as
260   // unsigned int if there is no sampler.
261   bool HasSampler =
262       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
263   bool FloatCoord = false;
  // If true, derivatives can be converted to 16 bit but coordinates cannot.
265   bool OnlyDerivatives = false;
266 
267   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269     Value *Coord = II.getOperand(OperandIndex);
270     // If the values are not derived from 16-bit values, we cannot optimize.
271     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
272       if (OperandIndex < ImageDimIntr->CoordStart ||
273           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274         return std::nullopt;
275       }
276       // All gradients can be converted, so convert only them
277       OnlyDerivatives = true;
278       break;
279     }
280 
281     assert(OperandIndex == ImageDimIntr->GradientStart ||
282            FloatCoord == Coord->getType()->isFloatingPointTy());
283     FloatCoord = Coord->getType()->isFloatingPointTy();
284   }
285 
286   if (!OnlyDerivatives && !ST->hasA16())
287     OnlyDerivatives = true; // Only supports G16
288 
289   // Check if there is a bias parameter and if it can be converted to f16
290   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
292     assert(HasSampler &&
293            "Only image instructions with a sampler can have a bias");
294     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
295       OnlyDerivatives = true;
296   }
297 
298   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299                                                ImageDimIntr->CoordStart))
300     return std::nullopt;
301 
302   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
303                                : Type::getInt16Ty(II.getContext());
304 
305   return modifyIntrinsicCall(
306       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
307         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308         if (!OnlyDerivatives) {
309           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310 
311           // Change the bias type
312           if (ImageDimIntr->NumBiasArgs != 0)
313             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
314         }
315 
316         unsigned EndIndex =
317             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319              OperandIndex < EndIndex; OperandIndex++) {
320           Args[OperandIndex] =
321               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
322         }
323 
324         // Convert the bias
325         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
327           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
328         }
329       });
330 }
331 
332 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333                                            const Value *Op0, const Value *Op1,
334                                            InstCombiner &IC) const {
335   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336   // infinity, gives +0.0. If we can prove we don't have one of the special
337   // cases then we can use a normal multiply instead.
338   // TODO: Create and use isKnownFiniteNonZero instead of just matching
339   // constants here.
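  //
  // For example (illustrative): if one operand is the constant 2.0 (finite and
  // non-zero), the legacy zero rule cannot fire, so a plain multiply is safe.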
340   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
341       match(Op1, PatternMatch::m_FiniteNonZero())) {
342     // One operand is not zero or infinity or NaN.
343     return true;
344   }
345 
346   auto *TLI = &IC.getTargetLibraryInfo();
347   if (isKnownNeverInfOrNaN(Op0, IC.getDataLayout(), TLI, 0,
348                            &IC.getAssumptionCache(), &I,
349                            &IC.getDominatorTree()) &&
350       isKnownNeverInfOrNaN(Op1, IC.getDataLayout(), TLI, 0,
351                            &IC.getAssumptionCache(), &I,
352                            &IC.getDominatorTree())) {
353     // Neither operand is infinity or NaN.
354     return true;
355   }
356   return false;
357 }
358 
359 /// Match an fpext from half to float, or a constant we can convert.
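//
// For example, this matches %e = fpext half %x to float (yielding %x), or a
// float constant such as 2.0 that converts to half exactly (yielding the
// equivalent half constant).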
360 static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
361   if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
362     return FPExtSrc->getType()->isHalfTy();
363 
364   ConstantFP *CFP;
365   if (match(Arg, m_ConstantFP(CFP))) {
366     bool LosesInfo;
367     APFloat Val(CFP->getValueAPF());
368     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
369     if (LosesInfo)
370       return false;
371 
372     FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
373     return true;
374   }
375 
376   return false;
377 }
378 
// Trim all zero components from the end of the vector \p UseV and return
// a demanded-elements mask covering the components that remain.
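//
// For example (illustrative), storing <4 x float> <a, b, 0.0, 0.0> yields a
// demanded mask of 0b0011: the two trailing zero components are trimmed.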
381 static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
382                                        Instruction *I) {
383   auto *VTy = cast<FixedVectorType>(UseV->getType());
384   unsigned VWidth = VTy->getNumElements();
385   APInt DemandedElts = APInt::getAllOnes(VWidth);
386 
387   for (int i = VWidth - 1; i > 0; --i) {
388     auto *Elt = findScalarElement(UseV, i);
389     if (!Elt)
390       break;
391 
392     if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
393       if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
394         break;
395     } else {
396       break;
397     }
398 
399     DemandedElts.clearBit(i);
400   }
401 
402   return DemandedElts;
403 }
404 
// Trim elements from the end of the vector \p V if they are equal to the
// first element of the vector.
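//
// For example (illustrative), storing <4 x float> <a, b, a, a> yields a
// demanded mask of 0b0011: the trailing copies of the first element are
// dropped, relying on the hasDefaultComponentBroadcast() behavior checked by
// the caller.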
407 static APInt defaultComponentBroadcast(Value *V) {
408   auto *VTy = cast<FixedVectorType>(V->getType());
409   unsigned VWidth = VTy->getNumElements();
410   APInt DemandedElts = APInt::getAllOnes(VWidth);
411   Value *FirstComponent = findScalarElement(V, 0);
412 
413   SmallVector<int> ShuffleMask;
414   if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
415     SVI->getShuffleMask(ShuffleMask);
416 
417   for (int I = VWidth - 1; I > 0; --I) {
418     if (ShuffleMask.empty()) {
419       auto *Elt = findScalarElement(V, I);
420       if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
421         break;
422     } else {
423       // Detect identical elements in the shufflevector result, even though
424       // findScalarElement cannot tell us what that element is.
425       if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
426         break;
427     }
428     DemandedElts.clearBit(I);
429   }
430 
431   return DemandedElts;
432 }
433 
434 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
435                                                     IntrinsicInst &II,
436                                                     APInt DemandedElts,
437                                                     int DMaskIdx = -1,
438                                                     bool IsLoad = true);
439 
440 /// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
441 static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
442   return (SqrtOp->getType()->isFloatTy() &&
443           (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
444          SqrtOp->getType()->isHalfTy();
445 }
446 
447 std::optional<Instruction *>
448 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
449   Intrinsic::ID IID = II.getIntrinsicID();
450   switch (IID) {
451   case Intrinsic::amdgcn_rcp: {
452     Value *Src = II.getArgOperand(0);
453 
454     // TODO: Move to ConstantFolding/InstSimplify?
455     if (isa<UndefValue>(Src)) {
456       Type *Ty = II.getType();
457       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
458       return IC.replaceInstUsesWith(II, QNaN);
459     }
460 
461     if (II.isStrictFP())
462       break;
463 
464     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
465       const APFloat &ArgVal = C->getValueAPF();
466       APFloat Val(ArgVal.getSemantics(), 1);
467       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
468 
469       // This is more precise than the instruction may give.
470       //
471       // TODO: The instruction always flushes denormal results (except for f16),
472       // should this also?
473       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
474     }
475 
476     FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
477     if (!FMF.allowContract())
478       break;
479     auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
480     if (!SrcCI)
481       break;
482 
483     auto IID = SrcCI->getIntrinsicID();
484     // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
485     //
486     // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
487     // relaxed.
488     if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
489       const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
490       FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
491       if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
492         break;
493 
494       if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
495         break;
496 
497       Function *NewDecl = Intrinsic::getDeclaration(
498           SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
499 
500       InnerFMF |= FMF;
501       II.setFastMathFlags(InnerFMF);
502 
503       II.setCalledFunction(NewDecl);
504       return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
505     }
506 
507     break;
508   }
509   case Intrinsic::amdgcn_sqrt:
510   case Intrinsic::amdgcn_rsq: {
511     Value *Src = II.getArgOperand(0);
512 
513     // TODO: Move to ConstantFolding/InstSimplify?
514     if (isa<UndefValue>(Src)) {
515       Type *Ty = II.getType();
516       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
517       return IC.replaceInstUsesWith(II, QNaN);
518     }
519 
520     // f16 amdgcn.sqrt is identical to regular sqrt.
521     if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
522       Function *NewDecl = Intrinsic::getDeclaration(
523           II.getModule(), Intrinsic::sqrt, {II.getType()});
524       II.setCalledFunction(NewDecl);
525       return &II;
526     }
527 
528     break;
529   }
530   case Intrinsic::amdgcn_log:
531   case Intrinsic::amdgcn_exp2: {
532     const bool IsLog = IID == Intrinsic::amdgcn_log;
533     const bool IsExp = IID == Intrinsic::amdgcn_exp2;
534     Value *Src = II.getArgOperand(0);
535     Type *Ty = II.getType();
536 
537     if (isa<PoisonValue>(Src))
538       return IC.replaceInstUsesWith(II, Src);
539 
540     if (IC.getSimplifyQuery().isUndefValue(Src))
541       return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
542 
543     if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
544       if (C->isInfinity()) {
545         // exp2(+inf) -> +inf
546         // log2(+inf) -> +inf
547         if (!C->isNegative())
548           return IC.replaceInstUsesWith(II, C);
549 
550         // exp2(-inf) -> 0
551         if (IsExp && C->isNegative())
552           return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
553       }
554 
555       if (II.isStrictFP())
556         break;
557 
558       if (C->isNaN()) {
559         Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
560         return IC.replaceInstUsesWith(II, Quieted);
561       }
562 
563       // f32 instruction doesn't handle denormals, f16 does.
564       if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
565         Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
566                                       : ConstantFP::get(Ty, 1.0);
567         return IC.replaceInstUsesWith(II, FoldedValue);
568       }
569 
570       if (IsLog && C->isNegative())
571         return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
572 
573       // TODO: Full constant folding matching hardware behavior.
574     }
575 
576     break;
577   }
578   case Intrinsic::amdgcn_frexp_mant:
579   case Intrinsic::amdgcn_frexp_exp: {
580     Value *Src = II.getArgOperand(0);
581     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
582       int Exp;
583       APFloat Significand =
584           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
585 
586       if (IID == Intrinsic::amdgcn_frexp_mant) {
587         return IC.replaceInstUsesWith(
588             II, ConstantFP::get(II.getContext(), Significand));
589       }
590 
591       // Match instruction special case behavior.
592       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
593         Exp = 0;
594 
595       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
596     }
597 
598     if (isa<UndefValue>(Src)) {
599       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
600     }
601 
602     break;
603   }
604   case Intrinsic::amdgcn_class: {
605     Value *Src0 = II.getArgOperand(0);
606     Value *Src1 = II.getArgOperand(1);
607     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
608     if (CMask) {
609       II.setCalledOperand(Intrinsic::getDeclaration(
610           II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
611 
612       // Clamp any excess bits, as they're illegal for the generic intrinsic.
613       II.setArgOperand(1, ConstantInt::get(Src1->getType(),
614                                            CMask->getZExtValue() & fcAllFlags));
615       return &II;
616     }
617 
618     // Propagate poison.
619     if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
620       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
621 
622     // llvm.amdgcn.class(_, undef) -> false
623     if (IC.getSimplifyQuery().isUndefValue(Src1))
624       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
625 
626     // llvm.amdgcn.class(undef, mask) -> mask != 0
627     if (IC.getSimplifyQuery().isUndefValue(Src0)) {
628       Value *CmpMask = IC.Builder.CreateICmpNE(
629           Src1, ConstantInt::getNullValue(Src1->getType()));
630       return IC.replaceInstUsesWith(II, CmpMask);
631     }
632     break;
633   }
634   case Intrinsic::amdgcn_cvt_pkrtz: {
635     Value *Src0 = II.getArgOperand(0);
636     Value *Src1 = II.getArgOperand(1);
637     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
638       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
639         const fltSemantics &HalfSem =
640             II.getType()->getScalarType()->getFltSemantics();
641         bool LosesInfo;
642         APFloat Val0 = C0->getValueAPF();
643         APFloat Val1 = C1->getValueAPF();
644         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
645         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
646 
647         Constant *Folded =
648             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
649                                  ConstantFP::get(II.getContext(), Val1)});
650         return IC.replaceInstUsesWith(II, Folded);
651       }
652     }
653 
654     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
655       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
656     }
657 
658     break;
659   }
660   case Intrinsic::amdgcn_cvt_pknorm_i16:
661   case Intrinsic::amdgcn_cvt_pknorm_u16:
662   case Intrinsic::amdgcn_cvt_pk_i16:
663   case Intrinsic::amdgcn_cvt_pk_u16: {
664     Value *Src0 = II.getArgOperand(0);
665     Value *Src1 = II.getArgOperand(1);
666 
667     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
668       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
669     }
670 
671     break;
672   }
673   case Intrinsic::amdgcn_ubfe:
674   case Intrinsic::amdgcn_sbfe: {
675     // Decompose simple cases into standard shifts.
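    // For example (illustrative): amdgcn.ubfe(i32 %x, 8, 8), which extracts
    // bits [8..15], becomes lshr(shl(%x, 16), 24).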
676     Value *Src = II.getArgOperand(0);
677     if (isa<UndefValue>(Src)) {
678       return IC.replaceInstUsesWith(II, Src);
679     }
680 
681     unsigned Width;
682     Type *Ty = II.getType();
683     unsigned IntSize = Ty->getIntegerBitWidth();
684 
685     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
686     if (CWidth) {
687       Width = CWidth->getZExtValue();
688       if ((Width & (IntSize - 1)) == 0) {
689         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
690       }
691 
692       // Hardware ignores high bits, so remove those.
693       if (Width >= IntSize) {
694         return IC.replaceOperand(
695             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
696       }
697     }
698 
699     unsigned Offset;
700     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
701     if (COffset) {
702       Offset = COffset->getZExtValue();
703       if (Offset >= IntSize) {
704         return IC.replaceOperand(
705             II, 1,
706             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
707       }
708     }
709 
710     bool Signed = IID == Intrinsic::amdgcn_sbfe;
711 
712     if (!CWidth || !COffset)
713       break;
714 
    // The case of Width == 0 is handled above, which makes this transformation
    // safe.  If Width == 0, then the ashr and lshr instructions would produce
    // poison since the shift amount would be equal to the bit size.
718     assert(Width != 0);
719 
720     // TODO: This allows folding to undef when the hardware has specific
721     // behavior?
722     if (Offset + Width < IntSize) {
723       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
724       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
725                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
726       RightShift->takeName(&II);
727       return IC.replaceInstUsesWith(II, RightShift);
728     }
729 
730     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
731                                : IC.Builder.CreateLShr(Src, Offset);
732 
733     RightShift->takeName(&II);
734     return IC.replaceInstUsesWith(II, RightShift);
735   }
736   case Intrinsic::amdgcn_exp:
737   case Intrinsic::amdgcn_exp_row:
738   case Intrinsic::amdgcn_exp_compr: {
739     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
740     unsigned EnBits = En->getZExtValue();
741     if (EnBits == 0xf)
742       break; // All inputs enabled.
743 
744     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
745     bool Changed = false;
746     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
747       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
748           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
749         Value *Src = II.getArgOperand(I + 2);
750         if (!isa<UndefValue>(Src)) {
751           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
752           Changed = true;
753         }
754       }
755     }
756 
757     if (Changed) {
758       return &II;
759     }
760 
761     break;
762   }
763   case Intrinsic::amdgcn_fmed3: {
764     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
765     // for the shader.
766 
767     Value *Src0 = II.getArgOperand(0);
768     Value *Src1 = II.getArgOperand(1);
769     Value *Src2 = II.getArgOperand(2);
770 
771     // Checking for NaN before canonicalization provides better fidelity when
772     // mapping other operations onto fmed3 since the order of operands is
773     // unchanged.
774     CallInst *NewCall = nullptr;
775     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
776       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
777     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
778       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
779     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
780       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
781     }
782 
783     if (NewCall) {
784       NewCall->copyFastMathFlags(&II);
785       NewCall->takeName(&II);
786       return IC.replaceInstUsesWith(II, NewCall);
787     }
788 
789     bool Swap = false;
790     // Canonicalize constants to RHS operands.
791     //
792     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
793     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
794       std::swap(Src0, Src1);
795       Swap = true;
796     }
797 
798     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
799       std::swap(Src1, Src2);
800       Swap = true;
801     }
802 
803     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
804       std::swap(Src0, Src1);
805       Swap = true;
806     }
807 
808     if (Swap) {
809       II.setArgOperand(0, Src0);
810       II.setArgOperand(1, Src1);
811       II.setArgOperand(2, Src2);
812       return &II;
813     }
814 
815     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
816       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
817         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
818           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
819                                        C2->getValueAPF());
820           return IC.replaceInstUsesWith(
821               II, ConstantFP::get(IC.Builder.getContext(), Result));
822         }
823       }
824     }
825 
826     if (!ST->hasMed3_16())
827       break;
828 
829     Value *X, *Y, *Z;
830 
831     // Repeat floating-point width reduction done for minnum/maxnum.
832     // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
833     if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
834         matchFPExtFromF16(Src2, Z)) {
835       Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
836                                                   {X, Y, Z}, &II, II.getName());
837       return new FPExtInst(NewCall, II.getType());
838     }
839 
840     break;
841   }
842   case Intrinsic::amdgcn_icmp:
843   case Intrinsic::amdgcn_fcmp: {
844     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
845     // Guard against invalid arguments.
846     int64_t CCVal = CC->getZExtValue();
847     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
848     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
849                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
850         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
851                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
852       break;
853 
854     Value *Src0 = II.getArgOperand(0);
855     Value *Src1 = II.getArgOperand(1);
856 
857     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
858       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
859         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
860         if (CCmp->isNullValue()) {
861           return IC.replaceInstUsesWith(
862               II, IC.Builder.CreateSExt(CCmp, II.getType()));
863         }
864 
865         // The result of V_ICMP/V_FCMP assembly instructions (which this
866         // intrinsic exposes) is one bit per thread, masked with the EXEC
867         // register (which contains the bitmask of live threads). So a
868         // comparison that always returns true is the same as a read of the
869         // EXEC register.
870         Function *NewF = Intrinsic::getDeclaration(
871             II.getModule(), Intrinsic::read_register, II.getType());
872         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
873         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
874         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
875         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
876         NewCall->addFnAttr(Attribute::Convergent);
877         NewCall->takeName(&II);
878         return IC.replaceInstUsesWith(II, NewCall);
879       }
880 
881       // Canonicalize constants to RHS.
882       CmpInst::Predicate SwapPred =
883           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
884       II.setArgOperand(0, Src1);
885       II.setArgOperand(1, Src0);
886       II.setArgOperand(
887           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
888       return &II;
889     }
890 
891     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
892       break;
893 
894     // Canonicalize compare eq with true value to compare != 0
895     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
896     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
897     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
898     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
899     Value *ExtSrc;
900     if (CCVal == CmpInst::ICMP_EQ &&
901         ((match(Src1, PatternMatch::m_One()) &&
902           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
903          (match(Src1, PatternMatch::m_AllOnes()) &&
904           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
905         ExtSrc->getType()->isIntegerTy(1)) {
906       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
907       IC.replaceOperand(II, 2,
908                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
909       return &II;
910     }
911 
912     CmpInst::Predicate SrcPred;
913     Value *SrcLHS;
914     Value *SrcRHS;
915 
916     // Fold compare eq/ne with 0 from a compare result as the predicate to the
917     // intrinsic. The typical use is a wave vote function in the library, which
918     // will be fed from a user code condition compared with 0. Fold in the
919     // redundant compare.
920 
921     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
922     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
923     //
924     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
925     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
926     if (match(Src1, PatternMatch::m_Zero()) &&
927         match(Src0, PatternMatch::m_ZExtOrSExt(
928                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
929                               PatternMatch::m_Value(SrcRHS))))) {
930       if (CCVal == CmpInst::ICMP_EQ)
931         SrcPred = CmpInst::getInversePredicate(SrcPred);
932 
933       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
934                                  ? Intrinsic::amdgcn_fcmp
935                                  : Intrinsic::amdgcn_icmp;
936 
937       Type *Ty = SrcLHS->getType();
938       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
939         // Promote to next legal integer type.
940         unsigned Width = CmpType->getBitWidth();
941         unsigned NewWidth = Width;
942 
943         // Don't do anything for i1 comparisons.
944         if (Width == 1)
945           break;
946 
947         if (Width <= 16)
948           NewWidth = 16;
949         else if (Width <= 32)
950           NewWidth = 32;
951         else if (Width <= 64)
952           NewWidth = 64;
953         else if (Width > 64)
954           break; // Can't handle this.
955 
956         if (Width != NewWidth) {
957           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
958           if (CmpInst::isSigned(SrcPred)) {
959             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
960             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
961           } else {
962             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
963             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
964           }
965         }
966       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
967         break;
968 
969       Function *NewF = Intrinsic::getDeclaration(
970           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
971       Value *Args[] = {SrcLHS, SrcRHS,
972                        ConstantInt::get(CC->getType(), SrcPred)};
973       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
974       NewCall->takeName(&II);
975       return IC.replaceInstUsesWith(II, NewCall);
976     }
977 
978     break;
979   }
980   case Intrinsic::amdgcn_mbcnt_hi: {
981     // exec_hi is all 0, so this is just a copy.
982     if (ST->isWave32())
983       return IC.replaceInstUsesWith(II, II.getArgOperand(1));
984     break;
985   }
986   case Intrinsic::amdgcn_ballot: {
987     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
988       if (Src->isZero()) {
989         // amdgcn.ballot(i1 0) is zero.
990         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
991       }
992     }
993     break;
994   }
995   case Intrinsic::amdgcn_wqm_vote: {
996     // wqm_vote is identity when the argument is constant.
997     if (!isa<Constant>(II.getArgOperand(0)))
998       break;
999 
1000     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1001   }
1002   case Intrinsic::amdgcn_kill: {
1003     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1004     if (!C || !C->getZExtValue())
1005       break;
1006 
1007     // amdgcn.kill(i1 1) is a no-op
1008     return IC.eraseInstFromFunction(II);
1009   }
1010   case Intrinsic::amdgcn_update_dpp: {
1011     Value *Old = II.getArgOperand(0);
1012 
1013     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1014     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1015     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1016     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1017         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1018       break;
1019 
    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
1021     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1022   }
1023   case Intrinsic::amdgcn_permlane16:
1024   case Intrinsic::amdgcn_permlane16_var:
1025   case Intrinsic::amdgcn_permlanex16:
1026   case Intrinsic::amdgcn_permlanex16_var: {
1027     // Discard vdst_in if it's not going to be read.
1028     Value *VDstIn = II.getArgOperand(0);
1029     if (isa<UndefValue>(VDstIn))
1030       break;
1031 
1032     // FetchInvalid operand idx.
1033     unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1034                           IID == Intrinsic::amdgcn_permlanex16)
1035                              ? 4  /* for permlane16 and permlanex16 */
1036                              : 3; /* for permlane16_var and permlanex16_var */
1037 
    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5.
    // For permlane16_var and permlanex16_var it should be 4.
1041     unsigned int BcIdx = FiIdx + 1;
1042 
1043     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1044     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1045     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1046       break;
1047 
1048     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1049   }
1050   case Intrinsic::amdgcn_permlane64:
1051     // A constant value is trivially uniform.
1052     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1053       return IC.replaceInstUsesWith(II, C);
1054     }
1055     break;
1056   case Intrinsic::amdgcn_readfirstlane:
1057   case Intrinsic::amdgcn_readlane: {
1058     // A constant value is trivially uniform.
1059     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1060       return IC.replaceInstUsesWith(II, C);
1061     }
1062 
    // The remaining folds may not be safe if exec is not guaranteed to be the
    // same between the def and the use.
1065     Value *Src = II.getArgOperand(0);
1066     Instruction *SrcInst = dyn_cast<Instruction>(Src);
1067     if (SrcInst && SrcInst->getParent() != II.getParent())
1068       break;
1069 
1070     // readfirstlane (readfirstlane x) -> readfirstlane x
1071     // readlane (readfirstlane x), y -> readfirstlane x
1072     if (match(Src,
1073               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1074       return IC.replaceInstUsesWith(II, Src);
1075     }
1076 
1077     if (IID == Intrinsic::amdgcn_readfirstlane) {
1078       // readfirstlane (readlane x, y) -> readlane x, y
1079       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1080         return IC.replaceInstUsesWith(II, Src);
1081       }
1082     } else {
1083       // readlane (readlane x, y), y -> readlane x, y
1084       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1085                          PatternMatch::m_Value(),
1086                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
1087         return IC.replaceInstUsesWith(II, Src);
1088       }
1089     }
1090 
1091     break;
1092   }
1093   case Intrinsic::amdgcn_fmul_legacy: {
1094     Value *Op0 = II.getArgOperand(0);
1095     Value *Op1 = II.getArgOperand(1);
1096 
1097     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1098     // infinity, gives +0.0.
1099     // TODO: Move to InstSimplify?
1100     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1101         match(Op1, PatternMatch::m_AnyZeroFP()))
1102       return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1103 
1104     // If we can prove we don't have one of the special cases then we can use a
1105     // normal fmul instruction instead.
1106     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1107       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1108       FMul->takeName(&II);
1109       return IC.replaceInstUsesWith(II, FMul);
1110     }
1111     break;
1112   }
1113   case Intrinsic::amdgcn_fma_legacy: {
1114     Value *Op0 = II.getArgOperand(0);
1115     Value *Op1 = II.getArgOperand(1);
1116     Value *Op2 = II.getArgOperand(2);
1117 
1118     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1119     // infinity, gives +0.0.
1120     // TODO: Move to InstSimplify?
1121     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1122         match(Op1, PatternMatch::m_AnyZeroFP())) {
1123       // It's tempting to just return Op2 here, but that would give the wrong
1124       // result if Op2 was -0.0.
1125       auto *Zero = ConstantFP::getZero(II.getType());
1126       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1127       FAdd->takeName(&II);
1128       return IC.replaceInstUsesWith(II, FAdd);
1129     }
1130 
1131     // If we can prove we don't have one of the special cases then we can use a
1132     // normal fma instead.
1133     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1134       II.setCalledOperand(Intrinsic::getDeclaration(
1135           II.getModule(), Intrinsic::fma, II.getType()));
1136       return &II;
1137     }
1138     break;
1139   }
1140   case Intrinsic::amdgcn_is_shared:
1141   case Intrinsic::amdgcn_is_private: {
1142     if (isa<UndefValue>(II.getArgOperand(0)))
1143       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1144 
1145     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1146       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1147     break;
1148   }
1149   case Intrinsic::amdgcn_buffer_store_format:
1150   case Intrinsic::amdgcn_raw_buffer_store_format:
1151   case Intrinsic::amdgcn_struct_buffer_store_format:
1152   case Intrinsic::amdgcn_raw_tbuffer_store:
1153   case Intrinsic::amdgcn_struct_tbuffer_store:
1154   case Intrinsic::amdgcn_tbuffer_store:
1155   case Intrinsic::amdgcn_image_store_1d:
1156   case Intrinsic::amdgcn_image_store_1darray:
1157   case Intrinsic::amdgcn_image_store_2d:
1158   case Intrinsic::amdgcn_image_store_2darray:
1159   case Intrinsic::amdgcn_image_store_2darraymsaa:
1160   case Intrinsic::amdgcn_image_store_2dmsaa:
1161   case Intrinsic::amdgcn_image_store_3d:
1162   case Intrinsic::amdgcn_image_store_cube:
1163   case Intrinsic::amdgcn_image_store_mip_1d:
1164   case Intrinsic::amdgcn_image_store_mip_1darray:
1165   case Intrinsic::amdgcn_image_store_mip_2d:
1166   case Intrinsic::amdgcn_image_store_mip_2darray:
1167   case Intrinsic::amdgcn_image_store_mip_3d:
1168   case Intrinsic::amdgcn_image_store_mip_cube: {
1169     if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1170       break;
1171 
1172     APInt DemandedElts;
1173     if (ST->hasDefaultComponentBroadcast())
1174       DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1175     else if (ST->hasDefaultComponentZero())
1176       DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1177     else
1178       break;
1179 
1180     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1181     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1182                                               false)) {
1183       return IC.eraseInstFromFunction(II);
1184     }
1185 
1186     break;
1187   }
1188   }
1189   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1190             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1191     return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1192   }
1193   return std::nullopt;
1194 }
1195 
1196 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1197 ///
/// Simplifying amdgcn image and buffer store intrinsics updates the definition
/// of the intrinsic's vector data argument, rather than the uses of the result
/// (which is how image and buffer loads are simplified).
1201 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1202 ///       struct returns.
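///
/// For example (illustrative), if only the first two components of a
/// <4 x float> buffer load are demanded, the call is rewritten to return
/// <2 x float> and a shufflevector rebuilds a value of the original vector
/// type for the remaining users.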
1203 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1204                                                     IntrinsicInst &II,
1205                                                     APInt DemandedElts,
1206                                                     int DMaskIdx, bool IsLoad) {
1207 
1208   auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1209                                              : II.getOperand(0)->getType());
1210   unsigned VWidth = IIVTy->getNumElements();
1211   if (VWidth == 1)
1212     return nullptr;
1213   Type *EltTy = IIVTy->getElementType();
1214 
1215   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1216   IC.Builder.SetInsertPoint(&II);
1217 
1218   // Assume the arguments are unchanged and later override them, if needed.
1219   SmallVector<Value *, 16> Args(II.args());
1220 
1221   if (DMaskIdx < 0) {
1222     // Buffer case.
1223 
1224     const unsigned ActiveBits = DemandedElts.getActiveBits();
1225     const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1226 
    // Start by assuming the whole prefix of elements is demanded, but possibly
    // clear some bits if there are unused components at the front (trailing
    // zero bits in the demanded mask), and update the offset accordingly.
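    //
    // Worked example (illustrative): for a raw buffer load of <4 x float> with
    // demanded elements 0b1100, ActiveBits is 4 and there are two unused
    // components at the front, so the load is shrunk to <2 x float> and
    // 2 * sizeof(float) = 8 bytes are added to the offset operand.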
1230     DemandedElts = (1 << ActiveBits) - 1;
1231 
1232     if (UnusedComponentsAtFront > 0) {
1233       static const unsigned InvalidOffsetIdx = 0xf;
1234 
1235       unsigned OffsetIdx;
1236       switch (II.getIntrinsicID()) {
1237       case Intrinsic::amdgcn_raw_buffer_load:
1238       case Intrinsic::amdgcn_raw_ptr_buffer_load:
1239         OffsetIdx = 1;
1240         break;
1241       case Intrinsic::amdgcn_s_buffer_load:
1242         // If resulting type is vec3, there is no point in trimming the
1243         // load with updated offset, as the vec3 would most likely be widened to
1244         // vec4 anyway during lowering.
1245         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1246           OffsetIdx = InvalidOffsetIdx;
1247         else
1248           OffsetIdx = 1;
1249         break;
1250       case Intrinsic::amdgcn_struct_buffer_load:
1251       case Intrinsic::amdgcn_struct_ptr_buffer_load:
1252         OffsetIdx = 2;
1253         break;
1254       default:
1255         // TODO: handle tbuffer* intrinsics.
1256         OffsetIdx = InvalidOffsetIdx;
1257         break;
1258       }
1259 
1260       if (OffsetIdx != InvalidOffsetIdx) {
1261         // Clear demanded bits and update the offset.
1262         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1263         auto *Offset = Args[OffsetIdx];
1264         unsigned SingleComponentSizeInBits =
1265             IC.getDataLayout().getTypeSizeInBits(EltTy);
1266         unsigned OffsetAdd =
1267             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1268         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1269         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1270       }
1271     }
1272   } else {
1273     // Image case.
1274 
1275     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1276     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1277 
1278     // dmask 0 has special semantics, do not simplify.
1279     if (DMaskVal == 0)
1280       return nullptr;
1281 
1282     // Mask off values that are undefined because the dmask doesn't cover them
1283     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1284 
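    // For example (illustrative): with dmask 0b1011 three components are
    // loaded or stored; if only result positions 0 and 2 of those are
    // demanded, the new dmask becomes 0b1001.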
1285     unsigned NewDMaskVal = 0;
1286     unsigned OrigLdStIdx = 0;
1287     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1288       const unsigned Bit = 1 << SrcIdx;
1289       if (!!(DMaskVal & Bit)) {
1290         if (!!DemandedElts[OrigLdStIdx])
1291           NewDMaskVal |= Bit;
1292         OrigLdStIdx++;
1293       }
1294     }
1295 
1296     if (DMaskVal != NewDMaskVal)
1297       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1298   }
1299 
1300   unsigned NewNumElts = DemandedElts.popcount();
1301   if (!NewNumElts)
1302     return PoisonValue::get(IIVTy);
1303 
1304   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1305     if (DMaskIdx >= 0)
1306       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1307     return nullptr;
1308   }
1309 
1310   // Validate function argument and return types, extracting overloaded types
1311   // along the way.
1312   SmallVector<Type *, 6> OverloadTys;
1313   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1314     return nullptr;
1315 
1316   Type *NewTy =
1317       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1318   OverloadTys[0] = NewTy;
1319 
1320   if (!IsLoad) {
1321     SmallVector<int, 8> EltMask;
1322     for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1323       if (DemandedElts[OrigStoreIdx])
1324         EltMask.push_back(OrigStoreIdx);
1325 
1326     if (NewNumElts == 1)
1327       Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1328     else
1329       Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1330   }
1331 
1332   Function *NewIntrin = Intrinsic::getDeclaration(
1333       II.getModule(), II.getIntrinsicID(), OverloadTys);
1334   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1335   NewCall->takeName(&II);
1336   NewCall->copyMetadata(II);
1337 
1338   if (IsLoad) {
1339     if (NewNumElts == 1) {
1340       return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1341                                             DemandedElts.countr_zero());
1342     }
1343 
1344     SmallVector<int, 8> EltMask;
1345     unsigned NewLoadIdx = 0;
1346     for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1347       if (!!DemandedElts[OrigLoadIdx])
1348         EltMask.push_back(NewLoadIdx++);
1349       else
1350         EltMask.push_back(NewNumElts);
1351     }
1352 
1353     auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1354 
1355     return Shuffle;
1356   }
1357 
1358   return NewCall;
1359 }
1360 
1361 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1362     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1363     APInt &UndefElts2, APInt &UndefElts3,
1364     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1365         SimplifyAndSetOp) const {
1366   switch (II.getIntrinsicID()) {
1367   case Intrinsic::amdgcn_buffer_load:
1368   case Intrinsic::amdgcn_buffer_load_format:
1369   case Intrinsic::amdgcn_raw_buffer_load:
1370   case Intrinsic::amdgcn_raw_ptr_buffer_load:
1371   case Intrinsic::amdgcn_raw_buffer_load_format:
1372   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1373   case Intrinsic::amdgcn_raw_tbuffer_load:
1374   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1375   case Intrinsic::amdgcn_s_buffer_load:
1376   case Intrinsic::amdgcn_struct_buffer_load:
1377   case Intrinsic::amdgcn_struct_ptr_buffer_load:
1378   case Intrinsic::amdgcn_struct_buffer_load_format:
1379   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1380   case Intrinsic::amdgcn_struct_tbuffer_load:
1381   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1382   case Intrinsic::amdgcn_tbuffer_load:
1383     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1384   default: {
1385     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1386       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1387     }
1388     break;
1389   }
1390   }
1391   return std::nullopt;
1392 }
1393