xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 #include <optional>
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "AMDGPUtti"
28 
namespace {

// Entry type for the AMDGPUImageDMaskIntrinsic table whose implementation is
// included below.
struct AMDGPUImageDMaskIntrinsic {
  // Intrinsic identifier, stored as a plain unsigned value.
  // NOTE(review): presumably an Intrinsic::ID of an image intrinsic that takes
  // a dmask operand -- confirm against the table definition.
  unsigned Intr;
};

// The macro must be defined before the include so that the included file
// emits the implementation part of the AMDGPUImageDMaskIntrinsic table.
// NOTE(review): InstCombineTables.inc looks TableGen-generated -- confirm.
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace
39 
40 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
41 //
42 // A single NaN input is folded to minnum, so we rely on that folding for
43 // handling NaNs.
44 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
45                            const APFloat &Src2) {
46   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
47 
48   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
49   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
50   if (Cmp0 == APFloat::cmpEqual)
51     return maxnum(Src1, Src2);
52 
53   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
54   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
55   if (Cmp1 == APFloat::cmpEqual)
56     return maxnum(Src0, Src2);
57 
58   return maxnum(Src0, Src1);
59 }
60 
61 // Check if a value can be converted to a 16-bit value without losing
62 // precision.
63 // The value is expected to be either a float (IsFloat = true) or an unsigned
64 // integer (IsFloat = false).
65 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
66   Type *VTy = V.getType();
67   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
68     // The value is already 16-bit, so we don't want to convert to 16-bit again!
69     return false;
70   }
71   if (IsFloat) {
72     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
73       // We need to check that if we cast the index down to a half, we do not
74       // lose precision.
75       APFloat FloatValue(ConstFloat->getValueAPF());
76       bool LosesInfo = true;
77       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
78                          &LosesInfo);
79       return !LosesInfo;
80     }
81   } else {
82     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
83       // We need to check that if we cast the index down to an i16, we do not
84       // lose precision.
85       APInt IntValue(ConstInt->getValue());
86       return IntValue.getActiveBits() <= 16;
87     }
88   }
89 
90   Value *CastSrc;
91   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
92                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
93   if (IsExt) {
94     Type *CastSrcTy = CastSrc->getType();
95     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
96       return true;
97   }
98 
99   return false;
100 }
101 
102 // Convert a value to 16-bit.
103 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
104   Type *VTy = V.getType();
105   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
106     return cast<Instruction>(&V)->getOperand(0);
107   if (VTy->isIntegerTy())
108     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
109   if (VTy->isFloatingPointTy())
110     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
111 
112   llvm_unreachable("Should never be called!");
113 }
114 
115 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
116 /// modified arguments (based on OldIntr) and replaces InstToReplace with
117 /// this newly created intrinsic call.
118 static std::optional<Instruction *> modifyIntrinsicCall(
119     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
120     InstCombiner &IC,
121     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
122         Func) {
123   SmallVector<Type *, 4> ArgTys;
124   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
125     return std::nullopt;
126 
127   SmallVector<Value *, 8> Args(OldIntr.args());
128 
129   // Modify arguments and types
130   Func(Args, ArgTys);
131 
132   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
133 
134   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
135   NewCall->takeName(&OldIntr);
136   NewCall->copyMetadata(OldIntr);
137   if (isa<FPMathOperator>(NewCall))
138     NewCall->copyFastMathFlags(&OldIntr);
139 
140   // Erase and replace uses
141   if (!InstToReplace.getType()->isVoidTy())
142     IC.replaceInstUsesWith(InstToReplace, NewCall);
143 
144   bool RemoveOldIntr = &OldIntr != &InstToReplace;
145 
146   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
147   if (RemoveOldIntr)
148     IC.eraseInstFromFunction(OldIntr);
149 
150   return RetValue;
151 }
152 
153 static std::optional<Instruction *>
154 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
155                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
156                              IntrinsicInst &II, InstCombiner &IC) {
157   // Optimize _L to _LZ when _L is zero
158   if (const auto *LZMappingInfo =
159           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
160     if (auto *ConstantLod =
161             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
162       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
164             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
165                                                      ImageDimIntr->Dim);
166         return modifyIntrinsicCall(
167             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
168               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169             });
170       }
171     }
172   }
173 
174   // Optimize _mip away, when 'lod' is zero
175   if (const auto *MIPMappingInfo =
176           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
177     if (auto *ConstantMip =
178             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
179       if (ConstantMip->isZero()) {
180         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
182                                                      ImageDimIntr->Dim);
183         return modifyIntrinsicCall(
184             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
185               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186             });
187       }
188     }
189   }
190 
191   // Optimize _bias away when 'bias' is zero
192   if (const auto *BiasMappingInfo =
193           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
194     if (auto *ConstantBias =
195             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
196       if (ConstantBias->isZero()) {
197         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
199                                                      ImageDimIntr->Dim);
200         return modifyIntrinsicCall(
201             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
202               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204             });
205       }
206     }
207   }
208 
209   // Optimize _offset away when 'offset' is zero
210   if (const auto *OffsetMappingInfo =
211           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
212     if (auto *ConstantOffset =
213             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
214       if (ConstantOffset->isZero()) {
215         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
216             AMDGPU::getImageDimIntrinsicByBaseOpcode(
217                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
218         return modifyIntrinsicCall(
219             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
220               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221             });
222       }
223     }
224   }
225 
226   // Try to use D16
227   if (ST->hasD16Images()) {
228 
229     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
230         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
231 
232     if (BaseOpcode->HasD16) {
233 
234       // If the only use of image intrinsic is a fptrunc (with conversion to
235       // half) then both fptrunc and image intrinsic will be replaced with image
236       // intrinsic with D16 flag.
237       if (II.hasOneUse()) {
238         Instruction *User = II.user_back();
239 
240         if (User->getOpcode() == Instruction::FPTrunc &&
241             User->getType()->getScalarType()->isHalfTy()) {
242 
243           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
244                                      [&](auto &Args, auto &ArgTys) {
245                                        // Change return type of image intrinsic.
246                                        // Set it to return type of fptrunc.
247                                        ArgTys[0] = User->getType();
248                                      });
249         }
250       }
251     }
252   }
253 
254   // Try to use A16 or G16
255   if (!ST->hasA16() && !ST->hasG16())
256     return std::nullopt;
257 
258   // Address is interpreted as float if the instruction has a sampler or as
259   // unsigned int if there is no sampler.
260   bool HasSampler =
261       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
262   bool FloatCoord = false;
263   // true means derivatives can be converted to 16 bit, coordinates not
264   bool OnlyDerivatives = false;
265 
266   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
267        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
268     Value *Coord = II.getOperand(OperandIndex);
269     // If the values are not derived from 16-bit values, we cannot optimize.
270     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
271       if (OperandIndex < ImageDimIntr->CoordStart ||
272           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
273         return std::nullopt;
274       }
275       // All gradients can be converted, so convert only them
276       OnlyDerivatives = true;
277       break;
278     }
279 
280     assert(OperandIndex == ImageDimIntr->GradientStart ||
281            FloatCoord == Coord->getType()->isFloatingPointTy());
282     FloatCoord = Coord->getType()->isFloatingPointTy();
283   }
284 
285   if (!OnlyDerivatives && !ST->hasA16())
286     OnlyDerivatives = true; // Only supports G16
287 
288   // Check if there is a bias parameter and if it can be converted to f16
289   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
290     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
291     assert(HasSampler &&
292            "Only image instructions with a sampler can have a bias");
293     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
294       OnlyDerivatives = true;
295   }
296 
297   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
298                                                ImageDimIntr->CoordStart))
299     return std::nullopt;
300 
301   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
302                                : Type::getInt16Ty(II.getContext());
303 
304   return modifyIntrinsicCall(
305       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
306         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
307         if (!OnlyDerivatives) {
308           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
309 
310           // Change the bias type
311           if (ImageDimIntr->NumBiasArgs != 0)
312             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
313         }
314 
315         unsigned EndIndex =
316             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
317         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
318              OperandIndex < EndIndex; OperandIndex++) {
319           Args[OperandIndex] =
320               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
321         }
322 
323         // Convert the bias
324         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
325           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
326           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
327         }
328       });
329 }
330 
331 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
332                                            InstCombiner &IC) const {
333   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
334   // infinity, gives +0.0. If we can prove we don't have one of the special
335   // cases then we can use a normal multiply instead.
336   // TODO: Create and use isKnownFiniteNonZero instead of just matching
337   // constants here.
338   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
339       match(Op1, PatternMatch::m_FiniteNonZero())) {
340     // One operand is not zero or infinity or NaN.
341     return true;
342   }
343   auto *TLI = &IC.getTargetLibraryInfo();
344   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
345       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
346     // Neither operand is infinity or NaN.
347     return true;
348   }
349   return false;
350 }
351 
352 std::optional<Instruction *>
353 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
354   Intrinsic::ID IID = II.getIntrinsicID();
355   switch (IID) {
356   case Intrinsic::amdgcn_rcp: {
357     Value *Src = II.getArgOperand(0);
358 
359     // TODO: Move to ConstantFolding/InstSimplify?
360     if (isa<UndefValue>(Src)) {
361       Type *Ty = II.getType();
362       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
363       return IC.replaceInstUsesWith(II, QNaN);
364     }
365 
366     if (II.isStrictFP())
367       break;
368 
369     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
370       const APFloat &ArgVal = C->getValueAPF();
371       APFloat Val(ArgVal.getSemantics(), 1);
372       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
373 
374       // This is more precise than the instruction may give.
375       //
376       // TODO: The instruction always flushes denormal results (except for f16),
377       // should this also?
378       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
379     }
380 
381     break;
382   }
383   case Intrinsic::amdgcn_sqrt:
384   case Intrinsic::amdgcn_rsq: {
385     Value *Src = II.getArgOperand(0);
386 
387     // TODO: Move to ConstantFolding/InstSimplify?
388     if (isa<UndefValue>(Src)) {
389       Type *Ty = II.getType();
390       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
391       return IC.replaceInstUsesWith(II, QNaN);
392     }
393 
394     break;
395   }
396   case Intrinsic::amdgcn_frexp_mant:
397   case Intrinsic::amdgcn_frexp_exp: {
398     Value *Src = II.getArgOperand(0);
399     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
400       int Exp;
401       APFloat Significand =
402           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
403 
404       if (IID == Intrinsic::amdgcn_frexp_mant) {
405         return IC.replaceInstUsesWith(
406             II, ConstantFP::get(II.getContext(), Significand));
407       }
408 
409       // Match instruction special case behavior.
410       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
411         Exp = 0;
412 
413       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
414     }
415 
416     if (isa<UndefValue>(Src)) {
417       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
418     }
419 
420     break;
421   }
422   case Intrinsic::amdgcn_class: {
423     Value *Src0 = II.getArgOperand(0);
424     Value *Src1 = II.getArgOperand(1);
425     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
426     if (!CMask) {
427       if (isa<UndefValue>(Src0)) {
428         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
429       }
430 
431       if (isa<UndefValue>(Src1)) {
432         return IC.replaceInstUsesWith(II,
433                                       ConstantInt::get(II.getType(), false));
434       }
435       break;
436     }
437 
438     uint32_t Mask = CMask->getZExtValue();
439 
440     // If all tests are made, it doesn't matter what the value is.
441     if ((Mask & fcAllFlags) == fcAllFlags) {
442       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
443     }
444 
445     if ((Mask & fcAllFlags) == 0) {
446       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
447     }
448 
449     if (Mask == fcNan && !II.isStrictFP()) {
450       // Equivalent of isnan. Replace with standard fcmp.
451       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
452       FCmp->takeName(&II);
453       return IC.replaceInstUsesWith(II, FCmp);
454     }
455 
456     if (Mask == fcZero && !II.isStrictFP()) {
457       // Equivalent of == 0.
458       Value *FCmp =
459           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
460 
461       FCmp->takeName(&II);
462       return IC.replaceInstUsesWith(II, FCmp);
463     }
464 
465     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
466     if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
467       return IC.replaceOperand(
468           II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
469     }
470 
471     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
472     if (!CVal) {
473       if (isa<UndefValue>(Src0)) {
474         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
475       }
476 
477       // Clamp mask to used bits
478       if ((Mask & fcAllFlags) != Mask) {
479         CallInst *NewCall = IC.Builder.CreateCall(
480             II.getCalledFunction(),
481             {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});
482 
483         NewCall->takeName(&II);
484         return IC.replaceInstUsesWith(II, NewCall);
485       }
486 
487       break;
488     }
489 
490     const APFloat &Val = CVal->getValueAPF();
491 
492     bool Result =
493         ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
494         ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
495         ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
496         ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
497         ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
498         ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
499         ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
500         ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
501         ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
502         ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());
503 
504     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
505   }
506   case Intrinsic::amdgcn_cvt_pkrtz: {
507     Value *Src0 = II.getArgOperand(0);
508     Value *Src1 = II.getArgOperand(1);
509     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
510       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
511         const fltSemantics &HalfSem =
512             II.getType()->getScalarType()->getFltSemantics();
513         bool LosesInfo;
514         APFloat Val0 = C0->getValueAPF();
515         APFloat Val1 = C1->getValueAPF();
516         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
517         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
518 
519         Constant *Folded =
520             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
521                                  ConstantFP::get(II.getContext(), Val1)});
522         return IC.replaceInstUsesWith(II, Folded);
523       }
524     }
525 
526     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
527       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
528     }
529 
530     break;
531   }
532   case Intrinsic::amdgcn_cvt_pknorm_i16:
533   case Intrinsic::amdgcn_cvt_pknorm_u16:
534   case Intrinsic::amdgcn_cvt_pk_i16:
535   case Intrinsic::amdgcn_cvt_pk_u16: {
536     Value *Src0 = II.getArgOperand(0);
537     Value *Src1 = II.getArgOperand(1);
538 
539     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
540       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
541     }
542 
543     break;
544   }
545   case Intrinsic::amdgcn_ubfe:
546   case Intrinsic::amdgcn_sbfe: {
547     // Decompose simple cases into standard shifts.
548     Value *Src = II.getArgOperand(0);
549     if (isa<UndefValue>(Src)) {
550       return IC.replaceInstUsesWith(II, Src);
551     }
552 
553     unsigned Width;
554     Type *Ty = II.getType();
555     unsigned IntSize = Ty->getIntegerBitWidth();
556 
557     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
558     if (CWidth) {
559       Width = CWidth->getZExtValue();
560       if ((Width & (IntSize - 1)) == 0) {
561         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
562       }
563 
564       // Hardware ignores high bits, so remove those.
565       if (Width >= IntSize) {
566         return IC.replaceOperand(
567             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
568       }
569     }
570 
571     unsigned Offset;
572     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
573     if (COffset) {
574       Offset = COffset->getZExtValue();
575       if (Offset >= IntSize) {
576         return IC.replaceOperand(
577             II, 1,
578             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
579       }
580     }
581 
582     bool Signed = IID == Intrinsic::amdgcn_sbfe;
583 
584     if (!CWidth || !COffset)
585       break;
586 
587     // The case of Width == 0 is handled above, which makes this transformation
588     // safe.  If Width == 0, then the ashr and lshr instructions become poison
589     // value since the shift amount would be equal to the bit size.
590     assert(Width != 0);
591 
592     // TODO: This allows folding to undef when the hardware has specific
593     // behavior?
594     if (Offset + Width < IntSize) {
595       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
596       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
597                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
598       RightShift->takeName(&II);
599       return IC.replaceInstUsesWith(II, RightShift);
600     }
601 
602     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
603                                : IC.Builder.CreateLShr(Src, Offset);
604 
605     RightShift->takeName(&II);
606     return IC.replaceInstUsesWith(II, RightShift);
607   }
608   case Intrinsic::amdgcn_exp:
609   case Intrinsic::amdgcn_exp_row:
610   case Intrinsic::amdgcn_exp_compr: {
611     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
612     unsigned EnBits = En->getZExtValue();
613     if (EnBits == 0xf)
614       break; // All inputs enabled.
615 
616     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
617     bool Changed = false;
618     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
619       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
620           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
621         Value *Src = II.getArgOperand(I + 2);
622         if (!isa<UndefValue>(Src)) {
623           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
624           Changed = true;
625         }
626       }
627     }
628 
629     if (Changed) {
630       return &II;
631     }
632 
633     break;
634   }
635   case Intrinsic::amdgcn_fmed3: {
636     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
637     // for the shader.
638 
639     Value *Src0 = II.getArgOperand(0);
640     Value *Src1 = II.getArgOperand(1);
641     Value *Src2 = II.getArgOperand(2);
642 
643     // Checking for NaN before canonicalization provides better fidelity when
644     // mapping other operations onto fmed3 since the order of operands is
645     // unchanged.
646     CallInst *NewCall = nullptr;
647     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
648       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
649     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
650       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
651     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
652       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
653     }
654 
655     if (NewCall) {
656       NewCall->copyFastMathFlags(&II);
657       NewCall->takeName(&II);
658       return IC.replaceInstUsesWith(II, NewCall);
659     }
660 
661     bool Swap = false;
662     // Canonicalize constants to RHS operands.
663     //
664     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
665     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
666       std::swap(Src0, Src1);
667       Swap = true;
668     }
669 
670     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
671       std::swap(Src1, Src2);
672       Swap = true;
673     }
674 
675     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
676       std::swap(Src0, Src1);
677       Swap = true;
678     }
679 
680     if (Swap) {
681       II.setArgOperand(0, Src0);
682       II.setArgOperand(1, Src1);
683       II.setArgOperand(2, Src2);
684       return &II;
685     }
686 
687     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
688       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
689         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
690           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
691                                        C2->getValueAPF());
692           return IC.replaceInstUsesWith(
693               II, ConstantFP::get(IC.Builder.getContext(), Result));
694         }
695       }
696     }
697 
698     break;
699   }
700   case Intrinsic::amdgcn_icmp:
701   case Intrinsic::amdgcn_fcmp: {
702     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
703     // Guard against invalid arguments.
704     int64_t CCVal = CC->getZExtValue();
705     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
706     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
707                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
708         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
709                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
710       break;
711 
712     Value *Src0 = II.getArgOperand(0);
713     Value *Src1 = II.getArgOperand(1);
714 
715     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
716       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
717         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
718         if (CCmp->isNullValue()) {
719           return IC.replaceInstUsesWith(
720               II, ConstantExpr::getSExt(CCmp, II.getType()));
721         }
722 
723         // The result of V_ICMP/V_FCMP assembly instructions (which this
724         // intrinsic exposes) is one bit per thread, masked with the EXEC
725         // register (which contains the bitmask of live threads). So a
726         // comparison that always returns true is the same as a read of the
727         // EXEC register.
728         Function *NewF = Intrinsic::getDeclaration(
729             II.getModule(), Intrinsic::read_register, II.getType());
730         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
731         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
732         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
733         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
734         NewCall->addFnAttr(Attribute::Convergent);
735         NewCall->takeName(&II);
736         return IC.replaceInstUsesWith(II, NewCall);
737       }
738 
739       // Canonicalize constants to RHS.
740       CmpInst::Predicate SwapPred =
741           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
742       II.setArgOperand(0, Src1);
743       II.setArgOperand(1, Src0);
744       II.setArgOperand(
745           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
746       return &II;
747     }
748 
749     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
750       break;
751 
752     // Canonicalize compare eq with true value to compare != 0
753     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
754     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
755     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
756     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
757     Value *ExtSrc;
758     if (CCVal == CmpInst::ICMP_EQ &&
759         ((match(Src1, PatternMatch::m_One()) &&
760           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
761          (match(Src1, PatternMatch::m_AllOnes()) &&
762           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
763         ExtSrc->getType()->isIntegerTy(1)) {
764       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
765       IC.replaceOperand(II, 2,
766                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
767       return &II;
768     }
769 
770     CmpInst::Predicate SrcPred;
771     Value *SrcLHS;
772     Value *SrcRHS;
773 
774     // Fold compare eq/ne with 0 from a compare result as the predicate to the
775     // intrinsic. The typical use is a wave vote function in the library, which
776     // will be fed from a user code condition compared with 0. Fold in the
777     // redundant compare.
778 
779     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
780     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
781     //
782     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
783     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
784     if (match(Src1, PatternMatch::m_Zero()) &&
785         match(Src0, PatternMatch::m_ZExtOrSExt(
786                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
787                               PatternMatch::m_Value(SrcRHS))))) {
788       if (CCVal == CmpInst::ICMP_EQ)
789         SrcPred = CmpInst::getInversePredicate(SrcPred);
790 
791       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
792                                  ? Intrinsic::amdgcn_fcmp
793                                  : Intrinsic::amdgcn_icmp;
794 
795       Type *Ty = SrcLHS->getType();
796       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
797         // Promote to next legal integer type.
798         unsigned Width = CmpType->getBitWidth();
799         unsigned NewWidth = Width;
800 
801         // Don't do anything for i1 comparisons.
802         if (Width == 1)
803           break;
804 
805         if (Width <= 16)
806           NewWidth = 16;
807         else if (Width <= 32)
808           NewWidth = 32;
809         else if (Width <= 64)
810           NewWidth = 64;
811         else if (Width > 64)
812           break; // Can't handle this.
813 
814         if (Width != NewWidth) {
815           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
816           if (CmpInst::isSigned(SrcPred)) {
817             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
818             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
819           } else {
820             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
821             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
822           }
823         }
824       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
825         break;
826 
827       Function *NewF = Intrinsic::getDeclaration(
828           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
829       Value *Args[] = {SrcLHS, SrcRHS,
830                        ConstantInt::get(CC->getType(), SrcPred)};
831       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
832       NewCall->takeName(&II);
833       return IC.replaceInstUsesWith(II, NewCall);
834     }
835 
836     break;
837   }
838   case Intrinsic::amdgcn_ballot: {
839     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
840       if (Src->isZero()) {
841         // amdgcn.ballot(i1 0) is zero.
842         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
843       }
844 
845       if (Src->isOne()) {
846         // amdgcn.ballot(i1 1) is exec.
847         const char *RegName = "exec";
848         if (II.getType()->isIntegerTy(32))
849           RegName = "exec_lo";
850         else if (!II.getType()->isIntegerTy(64))
851           break;
852 
853         Function *NewF = Intrinsic::getDeclaration(
854             II.getModule(), Intrinsic::read_register, II.getType());
855         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
856         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
857         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
858         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
859         NewCall->addFnAttr(Attribute::Convergent);
860         NewCall->takeName(&II);
861         return IC.replaceInstUsesWith(II, NewCall);
862       }
863     }
864     break;
865   }
866   case Intrinsic::amdgcn_wqm_vote: {
867     // wqm_vote is identity when the argument is constant.
868     if (!isa<Constant>(II.getArgOperand(0)))
869       break;
870 
871     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
872   }
873   case Intrinsic::amdgcn_kill: {
874     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
875     if (!C || !C->getZExtValue())
876       break;
877 
878     // amdgcn.kill(i1 1) is a no-op
879     return IC.eraseInstFromFunction(II);
880   }
881   case Intrinsic::amdgcn_update_dpp: {
882     Value *Old = II.getArgOperand(0);
883 
884     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
885     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
886     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
887     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
888         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
889       break;
890 
891     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
892     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
893   }
894   case Intrinsic::amdgcn_permlane16:
895   case Intrinsic::amdgcn_permlanex16: {
896     // Discard vdst_in if it's not going to be read.
897     Value *VDstIn = II.getArgOperand(0);
898     if (isa<UndefValue>(VDstIn))
899       break;
900 
901     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
902     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
903     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
904       break;
905 
906     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
907   }
908   case Intrinsic::amdgcn_permlane64:
909     // A constant value is trivially uniform.
910     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
911       return IC.replaceInstUsesWith(II, C);
912     }
913     break;
914   case Intrinsic::amdgcn_readfirstlane:
915   case Intrinsic::amdgcn_readlane: {
916     // A constant value is trivially uniform.
917     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
918       return IC.replaceInstUsesWith(II, C);
919     }
920 
921     // The rest of these may not be safe if the exec may not be the same between
922     // the def and use.
923     Value *Src = II.getArgOperand(0);
924     Instruction *SrcInst = dyn_cast<Instruction>(Src);
925     if (SrcInst && SrcInst->getParent() != II.getParent())
926       break;
927 
928     // readfirstlane (readfirstlane x) -> readfirstlane x
929     // readlane (readfirstlane x), y -> readfirstlane x
930     if (match(Src,
931               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
932       return IC.replaceInstUsesWith(II, Src);
933     }
934 
935     if (IID == Intrinsic::amdgcn_readfirstlane) {
936       // readfirstlane (readlane x, y) -> readlane x, y
937       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
938         return IC.replaceInstUsesWith(II, Src);
939       }
940     } else {
941       // readlane (readlane x, y), y -> readlane x, y
942       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
943                          PatternMatch::m_Value(),
944                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
945         return IC.replaceInstUsesWith(II, Src);
946       }
947     }
948 
949     break;
950   }
951   case Intrinsic::amdgcn_ldexp: {
952     // FIXME: This doesn't introduce new instructions and belongs in
953     // InstructionSimplify.
954     Type *Ty = II.getType();
955     Value *Op0 = II.getArgOperand(0);
956     Value *Op1 = II.getArgOperand(1);
957 
958     // Folding undef to qnan is safe regardless of the FP mode.
959     if (isa<UndefValue>(Op0)) {
960       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
961       return IC.replaceInstUsesWith(II, QNaN);
962     }
963 
964     const APFloat *C = nullptr;
965     match(Op0, PatternMatch::m_APFloat(C));
966 
967     // FIXME: Should flush denorms depending on FP mode, but that's ignored
968     // everywhere else.
969     //
970     // These cases should be safe, even with strictfp.
971     // ldexp(0.0, x) -> 0.0
972     // ldexp(-0.0, x) -> -0.0
973     // ldexp(inf, x) -> inf
974     // ldexp(-inf, x) -> -inf
975     if (C && (C->isZero() || C->isInfinity())) {
976       return IC.replaceInstUsesWith(II, Op0);
977     }
978 
979     // With strictfp, be more careful about possibly needing to flush denormals
980     // or not, and snan behavior depends on ieee_mode.
981     if (II.isStrictFP())
982       break;
983 
984     if (C && C->isNaN()) {
985       // FIXME: We just need to make the nan quiet here, but that's unavailable
986       // on APFloat, only IEEEfloat
987       auto *Quieted =
988           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
989       return IC.replaceInstUsesWith(II, Quieted);
990     }
991 
992     // ldexp(x, 0) -> x
993     // ldexp(x, undef) -> x
994     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
995       return IC.replaceInstUsesWith(II, Op0);
996     }
997 
998     break;
999   }
1000   case Intrinsic::amdgcn_fmul_legacy: {
1001     Value *Op0 = II.getArgOperand(0);
1002     Value *Op1 = II.getArgOperand(1);
1003 
1004     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1005     // infinity, gives +0.0.
1006     // TODO: Move to InstSimplify?
1007     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1008         match(Op1, PatternMatch::m_AnyZeroFP()))
1009       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
1010 
1011     // If we can prove we don't have one of the special cases then we can use a
1012     // normal fmul instruction instead.
1013     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1014       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1015       FMul->takeName(&II);
1016       return IC.replaceInstUsesWith(II, FMul);
1017     }
1018     break;
1019   }
1020   case Intrinsic::amdgcn_fma_legacy: {
1021     Value *Op0 = II.getArgOperand(0);
1022     Value *Op1 = II.getArgOperand(1);
1023     Value *Op2 = II.getArgOperand(2);
1024 
1025     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1026     // infinity, gives +0.0.
1027     // TODO: Move to InstSimplify?
1028     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1029         match(Op1, PatternMatch::m_AnyZeroFP())) {
1030       // It's tempting to just return Op2 here, but that would give the wrong
1031       // result if Op2 was -0.0.
1032       auto *Zero = ConstantFP::getNullValue(II.getType());
1033       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1034       FAdd->takeName(&II);
1035       return IC.replaceInstUsesWith(II, FAdd);
1036     }
1037 
1038     // If we can prove we don't have one of the special cases then we can use a
1039     // normal fma instead.
1040     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1041       II.setCalledOperand(Intrinsic::getDeclaration(
1042           II.getModule(), Intrinsic::fma, II.getType()));
1043       return &II;
1044     }
1045     break;
1046   }
1047   case Intrinsic::amdgcn_is_shared:
1048   case Intrinsic::amdgcn_is_private: {
1049     if (isa<UndefValue>(II.getArgOperand(0)))
1050       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1051 
1052     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1053       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1054     break;
1055   }
1056   default: {
1057     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1058             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1059       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1060     }
1061   }
1062   }
1063   return std::nullopt;
1064 }
1065 
1066 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1067 ///
1068 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1069 ///       struct returns.
1070 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1071                                                     IntrinsicInst &II,
1072                                                     APInt DemandedElts,
1073                                                     int DMaskIdx = -1) {
1074 
1075   auto *IIVTy = cast<FixedVectorType>(II.getType());
1076   unsigned VWidth = IIVTy->getNumElements();
1077   if (VWidth == 1)
1078     return nullptr;
1079   Type *EltTy = IIVTy->getElementType();
1080 
1081   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1082   IC.Builder.SetInsertPoint(&II);
1083 
1084   // Assume the arguments are unchanged and later override them, if needed.
1085   SmallVector<Value *, 16> Args(II.args());
1086 
1087   if (DMaskIdx < 0) {
1088     // Buffer case.
1089 
1090     const unsigned ActiveBits = DemandedElts.getActiveBits();
1091     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
1092 
1093     // Start assuming the prefix of elements is demanded, but possibly clear
1094     // some other bits if there are trailing zeros (unused components at front)
1095     // and update offset.
1096     DemandedElts = (1 << ActiveBits) - 1;
1097 
1098     if (UnusedComponentsAtFront > 0) {
1099       static const unsigned InvalidOffsetIdx = 0xf;
1100 
1101       unsigned OffsetIdx;
1102       switch (II.getIntrinsicID()) {
1103       case Intrinsic::amdgcn_raw_buffer_load:
1104         OffsetIdx = 1;
1105         break;
1106       case Intrinsic::amdgcn_s_buffer_load:
1107         // If resulting type is vec3, there is no point in trimming the
1108         // load with updated offset, as the vec3 would most likely be widened to
1109         // vec4 anyway during lowering.
1110         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1111           OffsetIdx = InvalidOffsetIdx;
1112         else
1113           OffsetIdx = 1;
1114         break;
1115       case Intrinsic::amdgcn_struct_buffer_load:
1116         OffsetIdx = 2;
1117         break;
1118       default:
1119         // TODO: handle tbuffer* intrinsics.
1120         OffsetIdx = InvalidOffsetIdx;
1121         break;
1122       }
1123 
1124       if (OffsetIdx != InvalidOffsetIdx) {
1125         // Clear demanded bits and update the offset.
1126         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1127         auto *Offset = Args[OffsetIdx];
1128         unsigned SingleComponentSizeInBits =
1129             IC.getDataLayout().getTypeSizeInBits(EltTy);
1130         unsigned OffsetAdd =
1131             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1132         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1133         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1134       }
1135     }
1136   } else {
1137     // Image case.
1138 
1139     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1140     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1141 
1142     // Mask off values that are undefined because the dmask doesn't cover them
1143     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1144 
1145     unsigned NewDMaskVal = 0;
1146     unsigned OrigLoadIdx = 0;
1147     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1148       const unsigned Bit = 1 << SrcIdx;
1149       if (!!(DMaskVal & Bit)) {
1150         if (!!DemandedElts[OrigLoadIdx])
1151           NewDMaskVal |= Bit;
1152         OrigLoadIdx++;
1153       }
1154     }
1155 
1156     if (DMaskVal != NewDMaskVal)
1157       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1158   }
1159 
1160   unsigned NewNumElts = DemandedElts.countPopulation();
1161   if (!NewNumElts)
1162     return UndefValue::get(IIVTy);
1163 
1164   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1165     if (DMaskIdx >= 0)
1166       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1167     return nullptr;
1168   }
1169 
1170   // Validate function argument and return types, extracting overloaded types
1171   // along the way.
1172   SmallVector<Type *, 6> OverloadTys;
1173   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1174     return nullptr;
1175 
1176   Type *NewTy =
1177       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1178   OverloadTys[0] = NewTy;
1179 
1180   Function *NewIntrin = Intrinsic::getDeclaration(
1181       II.getModule(), II.getIntrinsicID(), OverloadTys);
1182   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1183   NewCall->takeName(&II);
1184   NewCall->copyMetadata(II);
1185 
1186   if (NewNumElts == 1) {
1187     return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
1188                                           DemandedElts.countTrailingZeros());
1189   }
1190 
1191   SmallVector<int, 8> EltMask;
1192   unsigned NewLoadIdx = 0;
1193   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1194     if (!!DemandedElts[OrigLoadIdx])
1195       EltMask.push_back(NewLoadIdx++);
1196     else
1197       EltMask.push_back(NewNumElts);
1198   }
1199 
1200   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1201 
1202   return Shuffle;
1203 }
1204 
1205 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1206     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1207     APInt &UndefElts2, APInt &UndefElts3,
1208     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1209         SimplifyAndSetOp) const {
1210   switch (II.getIntrinsicID()) {
1211   case Intrinsic::amdgcn_buffer_load:
1212   case Intrinsic::amdgcn_buffer_load_format:
1213   case Intrinsic::amdgcn_raw_buffer_load:
1214   case Intrinsic::amdgcn_raw_buffer_load_format:
1215   case Intrinsic::amdgcn_raw_tbuffer_load:
1216   case Intrinsic::amdgcn_s_buffer_load:
1217   case Intrinsic::amdgcn_struct_buffer_load:
1218   case Intrinsic::amdgcn_struct_buffer_load_format:
1219   case Intrinsic::amdgcn_struct_tbuffer_load:
1220   case Intrinsic::amdgcn_tbuffer_load:
1221     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1222   default: {
1223     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1224       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1225     }
1226     break;
1227   }
1228   }
1229   return std::nullopt;
1230 }
1231