xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (revision 62ff619dcc3540659a319be71c9a489f1659e14a)
1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements the AMDGPU-specific InstCombine hooks of the target's
11 // TargetTransformInfo implementation (GCNTTIImpl). It uses the target's
12 // detailed information to simplify calls to AMDGPU intrinsics, while letting
13 // the target independent and default implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/IR/IntrinsicsAMDGPU.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "AMDGPUtti"
26 
27 namespace {
28 
// File-local record with a single field. NOTE(review): presumably the row type
// of the table whose implementation is pulled in from InstCombineTables.inc
// below -- confirm against the generated file.
29 struct AMDGPUImageDMaskIntrinsic {
30   unsigned Intr; // NOTE(review): presumably an intrinsic ID -- confirm.
31 };
32 
// Request the implementation part of the AMDGPUImageDMaskIntrinsic table from
// the included .inc file.
33 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
34 #include "InstCombineTables.inc"
35 
36 } // end anonymous namespace
37 
38 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
39 //
40 // A single NaN input is folded to minnum, so we rely on that folding for
41 // handling NaNs.
42 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
43                            const APFloat &Src2) {
44   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
45 
46   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
47   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
48   if (Cmp0 == APFloat::cmpEqual)
49     return maxnum(Src1, Src2);
50 
51   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
52   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
53   if (Cmp1 == APFloat::cmpEqual)
54     return maxnum(Src0, Src2);
55 
56   return maxnum(Src0, Src1);
57 }
58 
59 // Check if a value can be converted to a 16-bit value without losing
60 // precision.
61 // The value is expected to be either a float (IsFloat = true) or an unsigned
62 // integer (IsFloat = false).
63 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
64   Type *VTy = V.getType();
65   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
66     // The value is already 16-bit, so we don't want to convert to 16-bit again!
67     return false;
68   }
69   if (IsFloat) {
70     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
71       // We need to check that if we cast the index down to a half, we do not
72       // lose precision.
73       APFloat FloatValue(ConstFloat->getValueAPF());
74       bool LosesInfo = true;
75       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
76                          &LosesInfo);
77       return !LosesInfo;
78     }
79   } else {
80     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
81       // We need to check that if we cast the index down to an i16, we do not
82       // lose precision.
83       APInt IntValue(ConstInt->getValue());
84       return IntValue.getActiveBits() <= 16;
85     }
86   }
87 
88   Value *CastSrc;
89   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
90                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
91   if (IsExt) {
92     Type *CastSrcTy = CastSrc->getType();
93     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
94       return true;
95   }
96 
97   return false;
98 }
99 
100 // Convert a value to 16-bit.
101 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
102   Type *VTy = V.getType();
103   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
104     return cast<Instruction>(&V)->getOperand(0);
105   if (VTy->isIntegerTy())
106     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
107   if (VTy->isFloatingPointTy())
108     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
109 
110   llvm_unreachable("Should never be called!");
111 }
112 
113 /// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
114 /// the modified arguments.
115 static Optional<Instruction *> modifyIntrinsicCall(
116     IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
117     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
118         Func) {
119   SmallVector<Type *, 4> ArgTys;
120   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
121     return None;
122 
123   SmallVector<Value *, 8> Args(II.args());
124 
125   // Modify arguments and types
126   Func(Args, ArgTys);
127 
128   Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);
129 
130   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
131   NewCall->takeName(&II);
132   NewCall->copyMetadata(II);
133   if (isa<FPMathOperator>(NewCall))
134     NewCall->copyFastMathFlags(&II);
135 
136   // Erase and replace uses
137   if (!II.getType()->isVoidTy())
138     IC.replaceInstUsesWith(II, NewCall);
139   return IC.eraseInstFromFunction(II);
140 }
141 
142 static Optional<Instruction *>
143 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
144                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
145                              IntrinsicInst &II, InstCombiner &IC) {
146   // Optimize _L to _LZ when _L is zero
147   if (const auto *LZMappingInfo =
148           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
149     if (auto *ConstantLod =
150             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
151       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
152         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
153             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
154                                                      ImageDimIntr->Dim);
155         return modifyIntrinsicCall(
156             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
157               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
158             });
159       }
160     }
161   }
162 
163   // Optimize _mip away, when 'lod' is zero
164   if (const auto *MIPMappingInfo =
165           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
166     if (auto *ConstantMip =
167             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
168       if (ConstantMip->isZero()) {
169         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
170             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
171                                                      ImageDimIntr->Dim);
172         return modifyIntrinsicCall(
173             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
174               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
175             });
176       }
177     }
178   }
179 
180   // Optimize _bias away when 'bias' is zero
181   if (const auto *BiasMappingInfo =
182           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
183     if (auto *ConstantBias =
184             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
185       if (ConstantBias->isZero()) {
186         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
187             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
188                                                      ImageDimIntr->Dim);
189         return modifyIntrinsicCall(
190             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
191               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
192               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
193             });
194       }
195     }
196   }
197 
198   // Optimize _offset away when 'offset' is zero
199   if (const auto *OffsetMappingInfo =
200           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
201     if (auto *ConstantOffset =
202             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
203       if (ConstantOffset->isZero()) {
204         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
205             AMDGPU::getImageDimIntrinsicByBaseOpcode(
206                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
207         return modifyIntrinsicCall(
208             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
209               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
210             });
211       }
212     }
213   }
214 
215   // Try to use A16 or G16
216   if (!ST->hasA16() && !ST->hasG16())
217     return None;
218 
219   // Address is interpreted as float if the instruction has a sampler or as
220   // unsigned int if there is no sampler.
221   bool HasSampler =
222       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
223   bool FloatCoord = false;
224   // true means derivatives can be converted to 16 bit, coordinates not
225   bool OnlyDerivatives = false;
226 
227   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
228        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
229     Value *Coord = II.getOperand(OperandIndex);
230     // If the values are not derived from 16-bit values, we cannot optimize.
231     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
232       if (OperandIndex < ImageDimIntr->CoordStart ||
233           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
234         return None;
235       }
236       // All gradients can be converted, so convert only them
237       OnlyDerivatives = true;
238       break;
239     }
240 
241     assert(OperandIndex == ImageDimIntr->GradientStart ||
242            FloatCoord == Coord->getType()->isFloatingPointTy());
243     FloatCoord = Coord->getType()->isFloatingPointTy();
244   }
245 
246   if (!OnlyDerivatives && !ST->hasA16())
247     OnlyDerivatives = true; // Only supports G16
248 
249   // Check if there is a bias parameter and if it can be converted to f16
250   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
251     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
252     assert(HasSampler &&
253            "Only image instructions with a sampler can have a bias");
254     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
255       OnlyDerivatives = true;
256   }
257 
258   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
259                                                ImageDimIntr->CoordStart))
260     return None;
261 
262   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
263                                : Type::getInt16Ty(II.getContext());
264 
265   return modifyIntrinsicCall(
266       II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
267         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
268         if (!OnlyDerivatives) {
269           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
270 
271           // Change the bias type
272           if (ImageDimIntr->NumBiasArgs != 0)
273             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
274         }
275 
276         unsigned EndIndex =
277             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
278         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
279              OperandIndex < EndIndex; OperandIndex++) {
280           Args[OperandIndex] =
281               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
282         }
283 
284         // Convert the bias
285         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
286           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
287           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
288         }
289       });
290 }
291 
292 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
293                                            InstCombiner &IC) const {
294   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
295   // infinity, gives +0.0. If we can prove we don't have one of the special
296   // cases then we can use a normal multiply instead.
297   // TODO: Create and use isKnownFiniteNonZero instead of just matching
298   // constants here.
299   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
300       match(Op1, PatternMatch::m_FiniteNonZero())) {
301     // One operand is not zero or infinity or NaN.
302     return true;
303   }
304   auto *TLI = &IC.getTargetLibraryInfo();
305   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
306       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
307     // Neither operand is infinity or NaN.
308     return true;
309   }
310   return false;
311 }
312 
313 Optional<Instruction *>
314 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
315   Intrinsic::ID IID = II.getIntrinsicID();
316   switch (IID) {
317   case Intrinsic::amdgcn_rcp: {
318     Value *Src = II.getArgOperand(0);
319 
320     // TODO: Move to ConstantFolding/InstSimplify?
321     if (isa<UndefValue>(Src)) {
322       Type *Ty = II.getType();
323       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
324       return IC.replaceInstUsesWith(II, QNaN);
325     }
326 
327     if (II.isStrictFP())
328       break;
329 
330     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
331       const APFloat &ArgVal = C->getValueAPF();
332       APFloat Val(ArgVal.getSemantics(), 1);
333       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
334 
335       // This is more precise than the instruction may give.
336       //
337       // TODO: The instruction always flushes denormal results (except for f16),
338       // should this also?
339       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
340     }
341 
342     break;
343   }
344   case Intrinsic::amdgcn_rsq: {
345     Value *Src = II.getArgOperand(0);
346 
347     // TODO: Move to ConstantFolding/InstSimplify?
348     if (isa<UndefValue>(Src)) {
349       Type *Ty = II.getType();
350       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
351       return IC.replaceInstUsesWith(II, QNaN);
352     }
353 
354     break;
355   }
356   case Intrinsic::amdgcn_frexp_mant:
357   case Intrinsic::amdgcn_frexp_exp: {
358     Value *Src = II.getArgOperand(0);
359     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
360       int Exp;
361       APFloat Significand =
362           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
363 
364       if (IID == Intrinsic::amdgcn_frexp_mant) {
365         return IC.replaceInstUsesWith(
366             II, ConstantFP::get(II.getContext(), Significand));
367       }
368 
369       // Match instruction special case behavior.
370       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
371         Exp = 0;
372 
373       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
374     }
375 
376     if (isa<UndefValue>(Src)) {
377       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
378     }
379 
380     break;
381   }
382   case Intrinsic::amdgcn_class: {
383     enum {
384       S_NAN = 1 << 0,       // Signaling NaN
385       Q_NAN = 1 << 1,       // Quiet NaN
386       N_INFINITY = 1 << 2,  // Negative infinity
387       N_NORMAL = 1 << 3,    // Negative normal
388       N_SUBNORMAL = 1 << 4, // Negative subnormal
389       N_ZERO = 1 << 5,      // Negative zero
390       P_ZERO = 1 << 6,      // Positive zero
391       P_SUBNORMAL = 1 << 7, // Positive subnormal
392       P_NORMAL = 1 << 8,    // Positive normal
393       P_INFINITY = 1 << 9   // Positive infinity
394     };
395 
396     const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
397                               N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
398                               P_NORMAL | P_INFINITY;
399 
400     Value *Src0 = II.getArgOperand(0);
401     Value *Src1 = II.getArgOperand(1);
402     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
403     if (!CMask) {
404       if (isa<UndefValue>(Src0)) {
405         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
406       }
407 
408       if (isa<UndefValue>(Src1)) {
409         return IC.replaceInstUsesWith(II,
410                                       ConstantInt::get(II.getType(), false));
411       }
412       break;
413     }
414 
415     uint32_t Mask = CMask->getZExtValue();
416 
417     // If all tests are made, it doesn't matter what the value is.
418     if ((Mask & FullMask) == FullMask) {
419       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
420     }
421 
422     if ((Mask & FullMask) == 0) {
423       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
424     }
425 
426     if (Mask == (S_NAN | Q_NAN)) {
427       // Equivalent of isnan. Replace with standard fcmp.
428       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
429       FCmp->takeName(&II);
430       return IC.replaceInstUsesWith(II, FCmp);
431     }
432 
433     if (Mask == (N_ZERO | P_ZERO)) {
434       // Equivalent of == 0.
435       Value *FCmp =
436           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
437 
438       FCmp->takeName(&II);
439       return IC.replaceInstUsesWith(II, FCmp);
440     }
441 
442     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
443     if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
444         isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
445       return IC.replaceOperand(
446           II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
447     }
448 
449     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
450     if (!CVal) {
451       if (isa<UndefValue>(Src0)) {
452         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
453       }
454 
455       // Clamp mask to used bits
456       if ((Mask & FullMask) != Mask) {
457         CallInst *NewCall = IC.Builder.CreateCall(
458             II.getCalledFunction(),
459             {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
460 
461         NewCall->takeName(&II);
462         return IC.replaceInstUsesWith(II, NewCall);
463       }
464 
465       break;
466     }
467 
468     const APFloat &Val = CVal->getValueAPF();
469 
470     bool Result =
471         ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
472         ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
473         ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
474         ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
475         ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
476         ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
477         ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
478         ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
479         ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
480         ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
481 
482     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
483   }
484   case Intrinsic::amdgcn_cvt_pkrtz: {
485     Value *Src0 = II.getArgOperand(0);
486     Value *Src1 = II.getArgOperand(1);
487     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
488       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
489         const fltSemantics &HalfSem =
490             II.getType()->getScalarType()->getFltSemantics();
491         bool LosesInfo;
492         APFloat Val0 = C0->getValueAPF();
493         APFloat Val1 = C1->getValueAPF();
494         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
495         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
496 
497         Constant *Folded =
498             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
499                                  ConstantFP::get(II.getContext(), Val1)});
500         return IC.replaceInstUsesWith(II, Folded);
501       }
502     }
503 
504     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
505       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
506     }
507 
508     break;
509   }
510   case Intrinsic::amdgcn_cvt_pknorm_i16:
511   case Intrinsic::amdgcn_cvt_pknorm_u16:
512   case Intrinsic::amdgcn_cvt_pk_i16:
513   case Intrinsic::amdgcn_cvt_pk_u16: {
514     Value *Src0 = II.getArgOperand(0);
515     Value *Src1 = II.getArgOperand(1);
516 
517     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
518       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
519     }
520 
521     break;
522   }
523   case Intrinsic::amdgcn_ubfe:
524   case Intrinsic::amdgcn_sbfe: {
525     // Decompose simple cases into standard shifts.
526     Value *Src = II.getArgOperand(0);
527     if (isa<UndefValue>(Src)) {
528       return IC.replaceInstUsesWith(II, Src);
529     }
530 
531     unsigned Width;
532     Type *Ty = II.getType();
533     unsigned IntSize = Ty->getIntegerBitWidth();
534 
535     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
536     if (CWidth) {
537       Width = CWidth->getZExtValue();
538       if ((Width & (IntSize - 1)) == 0) {
539         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
540       }
541 
542       // Hardware ignores high bits, so remove those.
543       if (Width >= IntSize) {
544         return IC.replaceOperand(
545             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
546       }
547     }
548 
549     unsigned Offset;
550     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
551     if (COffset) {
552       Offset = COffset->getZExtValue();
553       if (Offset >= IntSize) {
554         return IC.replaceOperand(
555             II, 1,
556             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
557       }
558     }
559 
560     bool Signed = IID == Intrinsic::amdgcn_sbfe;
561 
562     if (!CWidth || !COffset)
563       break;
564 
565     // The case of Width == 0 is handled above, which makes this transformation
566     // safe.  If Width == 0, then the ashr and lshr instructions become poison
567     // value since the shift amount would be equal to the bit size.
568     assert(Width != 0);
569 
570     // TODO: This allows folding to undef when the hardware has specific
571     // behavior?
572     if (Offset + Width < IntSize) {
573       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
574       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
575                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
576       RightShift->takeName(&II);
577       return IC.replaceInstUsesWith(II, RightShift);
578     }
579 
580     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
581                                : IC.Builder.CreateLShr(Src, Offset);
582 
583     RightShift->takeName(&II);
584     return IC.replaceInstUsesWith(II, RightShift);
585   }
586   case Intrinsic::amdgcn_exp:
587   case Intrinsic::amdgcn_exp_compr: {
588     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
589     unsigned EnBits = En->getZExtValue();
590     if (EnBits == 0xf)
591       break; // All inputs enabled.
592 
593     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
594     bool Changed = false;
595     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
596       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
597           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
598         Value *Src = II.getArgOperand(I + 2);
599         if (!isa<UndefValue>(Src)) {
600           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
601           Changed = true;
602         }
603       }
604     }
605 
606     if (Changed) {
607       return &II;
608     }
609 
610     break;
611   }
612   case Intrinsic::amdgcn_fmed3: {
613     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
614     // for the shader.
615 
616     Value *Src0 = II.getArgOperand(0);
617     Value *Src1 = II.getArgOperand(1);
618     Value *Src2 = II.getArgOperand(2);
619 
620     // Checking for NaN before canonicalization provides better fidelity when
621     // mapping other operations onto fmed3 since the order of operands is
622     // unchanged.
623     CallInst *NewCall = nullptr;
624     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
625       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
626     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
627       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
628     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
629       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
630     }
631 
632     if (NewCall) {
633       NewCall->copyFastMathFlags(&II);
634       NewCall->takeName(&II);
635       return IC.replaceInstUsesWith(II, NewCall);
636     }
637 
638     bool Swap = false;
639     // Canonicalize constants to RHS operands.
640     //
641     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
642     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
643       std::swap(Src0, Src1);
644       Swap = true;
645     }
646 
647     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
648       std::swap(Src1, Src2);
649       Swap = true;
650     }
651 
652     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
653       std::swap(Src0, Src1);
654       Swap = true;
655     }
656 
657     if (Swap) {
658       II.setArgOperand(0, Src0);
659       II.setArgOperand(1, Src1);
660       II.setArgOperand(2, Src2);
661       return &II;
662     }
663 
664     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
665       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
666         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
667           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
668                                        C2->getValueAPF());
669           return IC.replaceInstUsesWith(
670               II, ConstantFP::get(IC.Builder.getContext(), Result));
671         }
672       }
673     }
674 
675     break;
676   }
677   case Intrinsic::amdgcn_icmp:
678   case Intrinsic::amdgcn_fcmp: {
679     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
680     // Guard against invalid arguments.
681     int64_t CCVal = CC->getZExtValue();
682     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
683     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
684                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
685         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
686                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
687       break;
688 
689     Value *Src0 = II.getArgOperand(0);
690     Value *Src1 = II.getArgOperand(1);
691 
692     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
693       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
694         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
695         if (CCmp->isNullValue()) {
696           return IC.replaceInstUsesWith(
697               II, ConstantExpr::getSExt(CCmp, II.getType()));
698         }
699 
700         // The result of V_ICMP/V_FCMP assembly instructions (which this
701         // intrinsic exposes) is one bit per thread, masked with the EXEC
702         // register (which contains the bitmask of live threads). So a
703         // comparison that always returns true is the same as a read of the
704         // EXEC register.
705         Function *NewF = Intrinsic::getDeclaration(
706             II.getModule(), Intrinsic::read_register, II.getType());
707         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
708         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
709         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
710         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
711         NewCall->addFnAttr(Attribute::Convergent);
712         NewCall->takeName(&II);
713         return IC.replaceInstUsesWith(II, NewCall);
714       }
715 
716       // Canonicalize constants to RHS.
717       CmpInst::Predicate SwapPred =
718           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
719       II.setArgOperand(0, Src1);
720       II.setArgOperand(1, Src0);
721       II.setArgOperand(
722           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
723       return &II;
724     }
725 
726     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
727       break;
728 
729     // Canonicalize compare eq with true value to compare != 0
730     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
731     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
732     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
733     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
734     Value *ExtSrc;
735     if (CCVal == CmpInst::ICMP_EQ &&
736         ((match(Src1, PatternMatch::m_One()) &&
737           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
738          (match(Src1, PatternMatch::m_AllOnes()) &&
739           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
740         ExtSrc->getType()->isIntegerTy(1)) {
741       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
742       IC.replaceOperand(II, 2,
743                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
744       return &II;
745     }
746 
747     CmpInst::Predicate SrcPred;
748     Value *SrcLHS;
749     Value *SrcRHS;
750 
751     // Fold compare eq/ne with 0 from a compare result as the predicate to the
752     // intrinsic. The typical use is a wave vote function in the library, which
753     // will be fed from a user code condition compared with 0. Fold in the
754     // redundant compare.
755 
756     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
757     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
758     //
759     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
760     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
761     if (match(Src1, PatternMatch::m_Zero()) &&
762         match(Src0, PatternMatch::m_ZExtOrSExt(
763                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
764                               PatternMatch::m_Value(SrcRHS))))) {
765       if (CCVal == CmpInst::ICMP_EQ)
766         SrcPred = CmpInst::getInversePredicate(SrcPred);
767 
768       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
769                                  ? Intrinsic::amdgcn_fcmp
770                                  : Intrinsic::amdgcn_icmp;
771 
772       Type *Ty = SrcLHS->getType();
773       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
774         // Promote to next legal integer type.
775         unsigned Width = CmpType->getBitWidth();
776         unsigned NewWidth = Width;
777 
778         // Don't do anything for i1 comparisons.
779         if (Width == 1)
780           break;
781 
782         if (Width <= 16)
783           NewWidth = 16;
784         else if (Width <= 32)
785           NewWidth = 32;
786         else if (Width <= 64)
787           NewWidth = 64;
788         else if (Width > 64)
789           break; // Can't handle this.
790 
791         if (Width != NewWidth) {
792           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
793           if (CmpInst::isSigned(SrcPred)) {
794             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
795             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
796           } else {
797             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
798             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
799           }
800         }
801       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
802         break;
803 
804       Function *NewF = Intrinsic::getDeclaration(
805           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
806       Value *Args[] = {SrcLHS, SrcRHS,
807                        ConstantInt::get(CC->getType(), SrcPred)};
808       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
809       NewCall->takeName(&II);
810       return IC.replaceInstUsesWith(II, NewCall);
811     }
812 
813     break;
814   }
815   case Intrinsic::amdgcn_ballot: {
816     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
817       if (Src->isZero()) {
818         // amdgcn.ballot(i1 0) is zero.
819         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
820       }
821 
822       if (Src->isOne()) {
823         // amdgcn.ballot(i1 1) is exec.
824         const char *RegName = "exec";
825         if (II.getType()->isIntegerTy(32))
826           RegName = "exec_lo";
827         else if (!II.getType()->isIntegerTy(64))
828           break;
829 
830         Function *NewF = Intrinsic::getDeclaration(
831             II.getModule(), Intrinsic::read_register, II.getType());
832         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
833         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
834         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
835         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
836         NewCall->addFnAttr(Attribute::Convergent);
837         NewCall->takeName(&II);
838         return IC.replaceInstUsesWith(II, NewCall);
839       }
840     }
841     break;
842   }
843   case Intrinsic::amdgcn_wqm_vote: {
844     // wqm_vote is identity when the argument is constant.
845     if (!isa<Constant>(II.getArgOperand(0)))
846       break;
847 
848     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
849   }
850   case Intrinsic::amdgcn_kill: {
851     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
852     if (!C || !C->getZExtValue())
853       break;
854 
855     // amdgcn.kill(i1 1) is a no-op
856     return IC.eraseInstFromFunction(II);
857   }
858   case Intrinsic::amdgcn_update_dpp: {
859     Value *Old = II.getArgOperand(0);
860 
861     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
862     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
863     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
864     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
865         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
866       break;
867 
868     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
869     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
870   }
871   case Intrinsic::amdgcn_permlane16:
872   case Intrinsic::amdgcn_permlanex16: {
873     // Discard vdst_in if it's not going to be read.
874     Value *VDstIn = II.getArgOperand(0);
875     if (isa<UndefValue>(VDstIn))
876       break;
877 
878     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
879     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
880     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
881       break;
882 
883     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
884   }
885   case Intrinsic::amdgcn_readfirstlane:
886   case Intrinsic::amdgcn_readlane: {
887     // A constant value is trivially uniform.
888     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
889       return IC.replaceInstUsesWith(II, C);
890     }
891 
892     // The rest of these may not be safe if the exec may not be the same between
893     // the def and use.
894     Value *Src = II.getArgOperand(0);
895     Instruction *SrcInst = dyn_cast<Instruction>(Src);
896     if (SrcInst && SrcInst->getParent() != II.getParent())
897       break;
898 
899     // readfirstlane (readfirstlane x) -> readfirstlane x
900     // readlane (readfirstlane x), y -> readfirstlane x
901     if (match(Src,
902               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
903       return IC.replaceInstUsesWith(II, Src);
904     }
905 
906     if (IID == Intrinsic::amdgcn_readfirstlane) {
907       // readfirstlane (readlane x, y) -> readlane x, y
908       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
909         return IC.replaceInstUsesWith(II, Src);
910       }
911     } else {
912       // readlane (readlane x, y), y -> readlane x, y
913       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
914                          PatternMatch::m_Value(),
915                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
916         return IC.replaceInstUsesWith(II, Src);
917       }
918     }
919 
920     break;
921   }
922   case Intrinsic::amdgcn_ldexp: {
923     // FIXME: This doesn't introduce new instructions and belongs in
924     // InstructionSimplify.
925     Type *Ty = II.getType();
926     Value *Op0 = II.getArgOperand(0);
927     Value *Op1 = II.getArgOperand(1);
928 
929     // Folding undef to qnan is safe regardless of the FP mode.
930     if (isa<UndefValue>(Op0)) {
931       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
932       return IC.replaceInstUsesWith(II, QNaN);
933     }
934 
935     const APFloat *C = nullptr;
936     match(Op0, PatternMatch::m_APFloat(C));
937 
938     // FIXME: Should flush denorms depending on FP mode, but that's ignored
939     // everywhere else.
940     //
941     // These cases should be safe, even with strictfp.
942     // ldexp(0.0, x) -> 0.0
943     // ldexp(-0.0, x) -> -0.0
944     // ldexp(inf, x) -> inf
945     // ldexp(-inf, x) -> -inf
946     if (C && (C->isZero() || C->isInfinity())) {
947       return IC.replaceInstUsesWith(II, Op0);
948     }
949 
950     // With strictfp, be more careful about possibly needing to flush denormals
951     // or not, and snan behavior depends on ieee_mode.
952     if (II.isStrictFP())
953       break;
954 
955     if (C && C->isNaN()) {
956       // FIXME: We just need to make the nan quiet here, but that's unavailable
957       // on APFloat, only IEEEfloat
958       auto *Quieted =
959           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
960       return IC.replaceInstUsesWith(II, Quieted);
961     }
962 
963     // ldexp(x, 0) -> x
964     // ldexp(x, undef) -> x
965     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
966       return IC.replaceInstUsesWith(II, Op0);
967     }
968 
969     break;
970   }
971   case Intrinsic::amdgcn_fmul_legacy: {
972     Value *Op0 = II.getArgOperand(0);
973     Value *Op1 = II.getArgOperand(1);
974 
975     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
976     // infinity, gives +0.0.
977     // TODO: Move to InstSimplify?
978     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
979         match(Op1, PatternMatch::m_AnyZeroFP()))
980       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
981 
982     // If we can prove we don't have one of the special cases then we can use a
983     // normal fmul instruction instead.
984     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
985       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
986       FMul->takeName(&II);
987       return IC.replaceInstUsesWith(II, FMul);
988     }
989     break;
990   }
991   case Intrinsic::amdgcn_fma_legacy: {
992     Value *Op0 = II.getArgOperand(0);
993     Value *Op1 = II.getArgOperand(1);
994     Value *Op2 = II.getArgOperand(2);
995 
996     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
997     // infinity, gives +0.0.
998     // TODO: Move to InstSimplify?
999     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1000         match(Op1, PatternMatch::m_AnyZeroFP())) {
1001       // It's tempting to just return Op2 here, but that would give the wrong
1002       // result if Op2 was -0.0.
1003       auto *Zero = ConstantFP::getNullValue(II.getType());
1004       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1005       FAdd->takeName(&II);
1006       return IC.replaceInstUsesWith(II, FAdd);
1007     }
1008 
1009     // If we can prove we don't have one of the special cases then we can use a
1010     // normal fma instead.
1011     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1012       II.setCalledOperand(Intrinsic::getDeclaration(
1013           II.getModule(), Intrinsic::fma, II.getType()));
1014       return &II;
1015     }
1016     break;
1017   }
1018   case Intrinsic::amdgcn_is_shared:
1019   case Intrinsic::amdgcn_is_private: {
1020     if (isa<UndefValue>(II.getArgOperand(0)))
1021       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1022 
1023     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1024       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1025     break;
1026   }
1027   default: {
1028     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1029             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1030       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1031     }
1032   }
1033   }
1034   return None;
1035 }
1036 
1037 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1038 ///
1039 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1040 ///       struct returns.
1041 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1042                                                     IntrinsicInst &II,
1043                                                     APInt DemandedElts,
1044                                                     int DMaskIdx = -1) {
1045 
1046   auto *IIVTy = cast<FixedVectorType>(II.getType());
1047   unsigned VWidth = IIVTy->getNumElements();
1048   if (VWidth == 1)
1049     return nullptr;
1050 
1051   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1052   IC.Builder.SetInsertPoint(&II);
1053 
1054   // Assume the arguments are unchanged and later override them, if needed.
1055   SmallVector<Value *, 16> Args(II.args());
1056 
1057   if (DMaskIdx < 0) {
1058     // Buffer case.
1059 
1060     const unsigned ActiveBits = DemandedElts.getActiveBits();
1061     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
1062 
1063     // Start assuming the prefix of elements is demanded, but possibly clear
1064     // some other bits if there are trailing zeros (unused components at front)
1065     // and update offset.
1066     DemandedElts = (1 << ActiveBits) - 1;
1067 
1068     if (UnusedComponentsAtFront > 0) {
1069       static const unsigned InvalidOffsetIdx = 0xf;
1070 
1071       unsigned OffsetIdx;
1072       switch (II.getIntrinsicID()) {
1073       case Intrinsic::amdgcn_raw_buffer_load:
1074         OffsetIdx = 1;
1075         break;
1076       case Intrinsic::amdgcn_s_buffer_load:
1077         // If resulting type is vec3, there is no point in trimming the
1078         // load with updated offset, as the vec3 would most likely be widened to
1079         // vec4 anyway during lowering.
1080         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1081           OffsetIdx = InvalidOffsetIdx;
1082         else
1083           OffsetIdx = 1;
1084         break;
1085       case Intrinsic::amdgcn_struct_buffer_load:
1086         OffsetIdx = 2;
1087         break;
1088       default:
1089         // TODO: handle tbuffer* intrinsics.
1090         OffsetIdx = InvalidOffsetIdx;
1091         break;
1092       }
1093 
1094       if (OffsetIdx != InvalidOffsetIdx) {
1095         // Clear demanded bits and update the offset.
1096         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1097         auto *Offset = II.getArgOperand(OffsetIdx);
1098         unsigned SingleComponentSizeInBits =
1099             IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
1100         unsigned OffsetAdd =
1101             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1102         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1103         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1104       }
1105     }
1106   } else {
1107     // Image case.
1108 
1109     ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
1110     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1111 
1112     // Mask off values that are undefined because the dmask doesn't cover them
1113     DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
1114 
1115     unsigned NewDMaskVal = 0;
1116     unsigned OrigLoadIdx = 0;
1117     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1118       const unsigned Bit = 1 << SrcIdx;
1119       if (!!(DMaskVal & Bit)) {
1120         if (!!DemandedElts[OrigLoadIdx])
1121           NewDMaskVal |= Bit;
1122         OrigLoadIdx++;
1123       }
1124     }
1125 
1126     if (DMaskVal != NewDMaskVal)
1127       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1128   }
1129 
1130   unsigned NewNumElts = DemandedElts.countPopulation();
1131   if (!NewNumElts)
1132     return UndefValue::get(II.getType());
1133 
1134   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1135     if (DMaskIdx >= 0)
1136       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1137     return nullptr;
1138   }
1139 
1140   // Validate function argument and return types, extracting overloaded types
1141   // along the way.
1142   SmallVector<Type *, 6> OverloadTys;
1143   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1144     return nullptr;
1145 
1146   Module *M = II.getParent()->getParent()->getParent();
1147   Type *EltTy = IIVTy->getElementType();
1148   Type *NewTy =
1149       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1150 
1151   OverloadTys[0] = NewTy;
1152   Function *NewIntrin =
1153       Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
1154 
1155   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1156   NewCall->takeName(&II);
1157   NewCall->copyMetadata(II);
1158 
1159   if (NewNumElts == 1) {
1160     return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
1161                                           NewCall,
1162                                           DemandedElts.countTrailingZeros());
1163   }
1164 
1165   SmallVector<int, 8> EltMask;
1166   unsigned NewLoadIdx = 0;
1167   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1168     if (!!DemandedElts[OrigLoadIdx])
1169       EltMask.push_back(NewLoadIdx++);
1170     else
1171       EltMask.push_back(NewNumElts);
1172   }
1173 
1174   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1175 
1176   return Shuffle;
1177 }
1178 
1179 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1180     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1181     APInt &UndefElts2, APInt &UndefElts3,
1182     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1183         SimplifyAndSetOp) const {
1184   switch (II.getIntrinsicID()) {
1185   case Intrinsic::amdgcn_buffer_load:
1186   case Intrinsic::amdgcn_buffer_load_format:
1187   case Intrinsic::amdgcn_raw_buffer_load:
1188   case Intrinsic::amdgcn_raw_buffer_load_format:
1189   case Intrinsic::amdgcn_raw_tbuffer_load:
1190   case Intrinsic::amdgcn_s_buffer_load:
1191   case Intrinsic::amdgcn_struct_buffer_load:
1192   case Intrinsic::amdgcn_struct_buffer_load_format:
1193   case Intrinsic::amdgcn_struct_tbuffer_load:
1194   case Intrinsic::amdgcn_tbuffer_load:
1195     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1196   default: {
1197     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1198       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1199     }
1200     break;
1201   }
1202   }
1203   return None;
1204 }
1205