xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/IR/IntrinsicsR600.h"
31 
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33 
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39 
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42   "amdgpu-global-isel-new-legality",
43   cl::desc("Use GlobalISel desired legality, rather than try to use "
44            "rules compatible with selection patterns"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 static constexpr unsigned MaxRegisterSize = 1024;
49 
50 // Round the number of elements up to the next power of two
51 static LLT getPow2VectorType(LLT Ty) {
52   unsigned NElts = Ty.getNumElements();
53   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
54   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
55 }
56 
57 // Round the number of bits up to the next power of two
58 static LLT getPow2ScalarType(LLT Ty) {
59   unsigned Bits = Ty.getSizeInBits();
60   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
61   return LLT::scalar(Pow2Bits);
62 }
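// Worked examples for the two helpers above: getPow2VectorType maps <3 x s16>
// to <4 x s16>, and getPow2ScalarType maps s24 to s32 and s48 to s64, since
// Log2_32_Ceil rounds the element or bit count up to the next power of two.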
63 
64 /// \returns true if this is an odd sized vector which should be widened by adding an
65 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
66 /// excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     if (!Ty.isVector())
71       return false;
72 
73     const LLT EltTy = Ty.getElementType();
74     const unsigned EltSize = EltTy.getSizeInBits();
75     return Ty.getNumElements() % 2 != 0 &&
76            EltSize > 1 && EltSize < 32 &&
77            Ty.getSizeInBits() % 32 != 0;
78   };
79 }
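// For example, isSmallOddVector matches <3 x s16> (odd element count, 16-bit
// elements, 48 bits total), but not <3 x s32> (32-bit elements), <4 x s16>
// (even element count), or <5 x s1> (s1 vectors are scalarized instead).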
80 
81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     return Ty.getSizeInBits() % 32 == 0;
85   };
86 }
87 
88 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getScalarType();
92     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
93   };
94 }
95 
96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     return std::pair(TypeIdx,
101                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
102   };
103 }
104 
105 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
106   return [=](const LegalityQuery &Query) {
107     const LLT Ty = Query.Types[TypeIdx];
108     const LLT EltTy = Ty.getElementType();
109     unsigned Size = Ty.getSizeInBits();
110     unsigned Pieces = (Size + 63) / 64;
111     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
112     return std::pair(TypeIdx, LLT::scalarOrVector(
113                                   ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
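// For example, fewerEltsToSize64Vector splits a <4 x s32> (128 bits, two
// 64-bit pieces) down to <2 x s32>, and a <3 x s32> (96 bits) also down to
// <2 x s32>.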
116 
117 // Increase the number of vector elements so the total size reaches the next
118 // multiple of 32 bits.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
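// For example, moreEltsToNext32Bit pads <3 x s8> (24 bits) out to <4 x s8>
// (32 bits) and <3 x s16> (48 bits) out to <4 x s16> (64 bits).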
134 
135 // Increase the number of vector elements to reach the next legal RegClass.
136 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
137   return [=](const LegalityQuery &Query) {
138     const LLT Ty = Query.Types[TypeIdx];
139     const unsigned NumElts = Ty.getNumElements();
140     const unsigned EltSize = Ty.getElementType().getSizeInBits();
141     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
142 
143     assert(EltSize == 32 || EltSize == 64);
144     assert(Ty.getSizeInBits() < MaxRegisterSize);
145 
146     unsigned NewNumElts;
147     // Find the nearest legal RegClass that is larger than the current type.
148     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
149       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
150         break;
151     }
152 
153     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
154   };
155 }
156 
157 static LLT getBufferRsrcScalarType(const LLT Ty) {
158   if (!Ty.isVector())
159     return LLT::scalar(128);
160   const ElementCount NumElems = Ty.getElementCount();
161   return LLT::vector(NumElems, LLT::scalar(128));
162 }
163 
164 static LLT getBufferRsrcRegisterType(const LLT Ty) {
165   if (!Ty.isVector())
166     return LLT::fixed_vector(4, LLT::scalar(32));
167   const unsigned NumElems = Ty.getElementCount().getFixedValue();
168   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
169 }
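// For example, for a plain buffer resource p8 the scalar form is s128 and the
// register form is <4 x s32>; a <2 x p8> maps to <2 x s128> and <8 x s32>
// respectively.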
170 
171 static LLT getBitcastRegisterType(const LLT Ty) {
172   const unsigned Size = Ty.getSizeInBits();
173 
174   if (Size <= 32) {
175     // <2 x s8> -> s16
176     // <4 x s8> -> s32
177     return LLT::scalar(Size);
178   }
179 
180   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
181 }
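// For sizes above 32 bits this produces a vector of s32, e.g. <6 x s16>
// (96 bits) becomes <3 x s32> and <8 x s16> becomes <4 x s32>.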
182 
183 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
184   return [=](const LegalityQuery &Query) {
185     const LLT Ty = Query.Types[TypeIdx];
186     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
187   };
188 }
189 
190 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT Ty = Query.Types[TypeIdx];
193     unsigned Size = Ty.getSizeInBits();
194     assert(Size % 32 == 0);
195     return std::pair(
196         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
197   };
198 }
199 
200 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
201   return [=](const LegalityQuery &Query) {
202     const LLT QueryTy = Query.Types[TypeIdx];
203     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
204   };
205 }
206 
207 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
208   return [=](const LegalityQuery &Query) {
209     const LLT QueryTy = Query.Types[TypeIdx];
210     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
211   };
212 }
213 
214 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
215   return [=](const LegalityQuery &Query) {
216     const LLT QueryTy = Query.Types[TypeIdx];
217     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
218   };
219 }
220 
221 static bool isRegisterSize(unsigned Size) {
222   return Size % 32 == 0 && Size <= MaxRegisterSize;
223 }
224 
225 static bool isRegisterVectorElementType(LLT EltTy) {
226   const int EltSize = EltTy.getSizeInBits();
227   return EltSize == 16 || EltSize % 32 == 0;
228 }
229 
230 static bool isRegisterVectorType(LLT Ty) {
231   const int EltSize = Ty.getElementType().getSizeInBits();
232   return EltSize == 32 || EltSize == 64 ||
233          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
234          EltSize == 128 || EltSize == 256;
235 }
236 
237 static bool isRegisterType(LLT Ty) {
238   if (!isRegisterSize(Ty.getSizeInBits()))
239     return false;
240 
241   if (Ty.isVector())
242     return isRegisterVectorType(Ty);
243 
244   return true;
245 }
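// For example, s32, s64, <2 x s16> and <4 x s32> are register types, while
// <3 x s16> and <3 x s8> are not, since their sizes are not multiples of
// 32 bits.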
246 
247 // Any combination of 32 or 64-bit elements up to the maximum register size, and
248 // multiples of v2s16.
249 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
250   return [=](const LegalityQuery &Query) {
251     return isRegisterType(Query.Types[TypeIdx]);
252   };
253 }
254 
255 // RegisterType that doesn't have a corresponding RegClass.
256 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
257   return [=](const LegalityQuery &Query) {
258     LLT Ty = Query.Types[TypeIdx];
259     return isRegisterType(Ty) &&
260            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
261   };
262 }
263 
264 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
265   return [=](const LegalityQuery &Query) {
266     const LLT QueryTy = Query.Types[TypeIdx];
267     if (!QueryTy.isVector())
268       return false;
269     const LLT EltTy = QueryTy.getElementType();
270     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
271   };
272 }
273 
274 // If we have a truncating store or an extending load with a data size larger
275 // than 32-bits, we need to reduce to a 32-bit type.
276 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
277   return [=](const LegalityQuery &Query) {
278     const LLT Ty = Query.Types[TypeIdx];
279     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
280            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
281   };
282 }
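// For example, an extending load of an s32 memory value into an s64 register,
// or a truncating store of an s64 register as s32 in memory, matches this
// predicate; such operations are narrowed to work on s32.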
283 
284 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
285 // handle some operations by just promoting the register during
286 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
287 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
288                                     bool IsLoad, bool IsAtomic) {
289   switch (AS) {
290   case AMDGPUAS::PRIVATE_ADDRESS:
291     // FIXME: Private element size.
292     return ST.enableFlatScratch() ? 128 : 32;
293   case AMDGPUAS::LOCAL_ADDRESS:
294     return ST.useDS128() ? 128 : 64;
295   case AMDGPUAS::GLOBAL_ADDRESS:
296   case AMDGPUAS::CONSTANT_ADDRESS:
297   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
298   case AMDGPUAS::BUFFER_RESOURCE:
299     // Treat constant and global as identical. SMRD loads are sometimes usable for
300     // global loads (ideally constant address space should be eliminated)
301     // depending on the context. Legality cannot be context dependent, but
302     // RegBankSelect can split the load as necessary depending on the pointer
303     // register bank/uniformity and if the memory is invariant or not written in a
304     // kernel.
305     return IsLoad ? 512 : 128;
306   default:
307     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
308     // if they may alias scratch depending on the subtarget.  This needs to be
309     // moved to custom handling to use addressMayBeAccessedAsPrivate
310     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
311   }
312 }
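// For example, with these limits a global or constant load may be up to 512
// bits wide while a global store is capped at 128 bits, and a local (LDS)
// access is capped at 64 bits, or 128 when useDS128() is set.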
313 
314 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
315                                  const LegalityQuery &Query) {
316   const LLT Ty = Query.Types[0];
317 
318   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
319   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
320 
321   unsigned RegSize = Ty.getSizeInBits();
322   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
323   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
324   unsigned AS = Query.Types[1].getAddressSpace();
325 
326   // All of these need to be custom lowered to cast the pointer operand.
327   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
328     return false;
329 
330   // Do not handle extending vector loads.
331   if (Ty.isVector() && MemSize != RegSize)
332     return false;
333 
334   // TODO: We should be able to widen loads if the alignment is high enough, but
335   // we also need to modify the memory access size.
336 #if 0
337   // Accept widening loads based on alignment.
338   if (IsLoad && MemSize < Size)
339     MemSize = std::max(MemSize, Align);
340 #endif
341 
342   // Only 1-byte and 2-byte to 32-bit extloads are valid.
343   if (MemSize != RegSize && RegSize != 32)
344     return false;
345 
346   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
347                                     Query.MMODescrs[0].Ordering !=
348                                         AtomicOrdering::NotAtomic))
349     return false;
350 
351   switch (MemSize) {
352   case 8:
353   case 16:
354   case 32:
355   case 64:
356   case 128:
357     break;
358   case 96:
359     if (!ST.hasDwordx3LoadStores())
360       return false;
361     break;
362   case 256:
363   case 512:
364     // These may contextually need to be broken down.
365     break;
366   default:
367     return false;
368   }
369 
370   assert(RegSize >= MemSize);
371 
372   if (AlignBits < MemSize) {
373     const SITargetLowering *TLI = ST.getTargetLowering();
374     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
375                                                  Align(AlignBits / 8)))
376       return false;
377   }
378 
379   return true;
380 }
381 
382 // The newer buffer intrinsic forms take their resource arguments as
383 // pointers in address space 8, aka s128 values. However, in order to not break
384 // SelectionDAG, the underlying operations have to continue to take v4i32
385 // arguments. Therefore, we convert resource pointers (or vectors of them)
386 // to integer values here.
387 static bool hasBufferRsrcWorkaround(const LLT Ty) {
388   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
389     return true;
390   if (Ty.isVector()) {
391     const LLT ElemTy = Ty.getElementType();
392     return hasBufferRsrcWorkaround(ElemTy);
393   }
394   return false;
395 }
396 
397 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
398 // work around this. Eventually it should ignore the type for loads and only care
399 // about the size. Return true in cases where we will work around this for now by
400 // bitcasting.
401 static bool loadStoreBitcastWorkaround(const LLT Ty) {
402   if (EnableNewLegality)
403     return false;
404 
405   const unsigned Size = Ty.getSizeInBits();
406   if (Size <= 64)
407     return false;
408   // Address space 8 pointers get their own workaround.
409   if (hasBufferRsrcWorkaround(Ty))
410     return false;
411   if (!Ty.isVector())
412     return true;
413 
414   LLT EltTy = Ty.getElementType();
415   if (EltTy.isPointer())
416     return true;
417 
418   unsigned EltSize = EltTy.getSizeInBits();
419   return EltSize != 32 && EltSize != 64;
420 }
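// For example, loadStoreBitcastWorkaround returns true for s96, s128 and
// <6 x s16>, but false for <4 x s32>, <2 x s64>, anything of 64 bits or less,
// and for everything when -amdgpu-global-isel-new-legality is enabled.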
421 
422 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
423   const LLT Ty = Query.Types[0];
424   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
425          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
426 }
427 
428 /// Return true if a load or store of the type should be lowered with a bitcast
429 /// to a different type.
430 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
431                                        const LLT MemTy) {
432   const unsigned MemSizeInBits = MemTy.getSizeInBits();
433   const unsigned Size = Ty.getSizeInBits();
434   if (Size != MemSizeInBits)
435     return Size <= 32 && Ty.isVector();
436 
437   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
438     return true;
439 
440   // Don't try to handle bitcasting vector ext loads for now.
441   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
442          (Size <= 32 || isRegisterSize(Size)) &&
443          !isRegisterVectorElementType(Ty.getElementType());
444 }
445 
446 /// Return true if we should legalize a load by widening an odd sized memory
447 /// access up to the alignment. Note that in this case the memory access itself
448 /// changes, not the size of the result register.
449 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
450                             uint64_t AlignInBits, unsigned AddrSpace,
451                             unsigned Opcode) {
452   unsigned SizeInBits = MemoryTy.getSizeInBits();
453   // We don't want to widen cases that are naturally legal.
454   if (isPowerOf2_32(SizeInBits))
455     return false;
456 
457   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
458   // end up widening these for a scalar load during RegBankSelect, since there
459   // aren't 96-bit scalar loads.
460   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
461     return false;
462 
463   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
464     return false;
465 
466   // A load is known dereferenceable up to the alignment, so it's legal to widen
467   // to it.
468   //
469   // TODO: Could check dereferenceable for less aligned cases.
470   unsigned RoundedSize = NextPowerOf2(SizeInBits);
471   if (AlignInBits < RoundedSize)
472     return false;
473 
474   // Do not widen if it would introduce a slow unaligned load.
475   const SITargetLowering *TLI = ST.getTargetLowering();
476   unsigned Fast = 0;
477   return TLI->allowsMisalignedMemoryAccessesImpl(
478              RoundedSize, AddrSpace, Align(AlignInBits / 8),
479              MachineMemOperand::MOLoad, &Fast) &&
480          Fast;
481 }
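// As a sketch, assuming a subtarget without dwordx3 load/stores that allows
// the resulting access: a 96-bit load known to be 128-bit aligned qualifies
// here and is widened to a 128-bit load, since the extra bytes are known
// dereferenceable up to the alignment.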
482 
483 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
484                             unsigned Opcode) {
485   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
486     return false;
487 
488   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
489                          Query.MMODescrs[0].AlignInBits,
490                          Query.Types[1].getAddressSpace(), Opcode);
491 }
492 
493 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
494 /// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
495 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
496 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
497                                    MachineRegisterInfo &MRI, unsigned Idx) {
498   MachineOperand &MO = MI.getOperand(Idx);
499 
500   const LLT PointerTy = MRI.getType(MO.getReg());
501 
502   // Paranoidly prevent us from doing this multiple times.
503   if (!hasBufferRsrcWorkaround(PointerTy))
504     return PointerTy;
505 
506   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
507   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
508   if (!PointerTy.isVector()) {
509     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
510     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
511     const LLT S32 = LLT::scalar(32);
512 
513     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
514     std::array<Register, 4> VectorElems;
515     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
516     for (unsigned I = 0; I < NumParts; ++I)
517       VectorElems[I] =
518           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
519     B.buildMergeValues(MO, VectorElems);
520     MO.setReg(VectorReg);
521     return VectorTy;
522   }
523   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
524   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
525   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
526   B.buildIntToPtr(MO, Scalar);
527   MO.setReg(BitcastReg);
528 
529   return VectorTy;
530 }
531 
532 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
533 /// the form in which the value must be in order to be passed to the low-level
534 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
535 /// needed in order to account for the fact that we can't define a register
536 /// class for s128 without breaking SelectionDAG.
537 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
538   MachineRegisterInfo &MRI = *B.getMRI();
539   const LLT PointerTy = MRI.getType(Pointer);
540   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
541   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
542 
543   if (!PointerTy.isVector()) {
544     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
545     SmallVector<Register, 4> PointerParts;
546     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
547     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
548     for (unsigned I = 0; I < NumParts; ++I)
549       PointerParts.push_back(Unmerged.getReg(I));
550     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
551   }
552   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
553   return B.buildBitcast(VectorTy, Scalar).getReg(0);
554 }
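// A sketch of the MIR this builds for a plain p8 resource %rsrc (register
// names are illustrative only):
//   %e0:_(s32), %e1:_(s32), %e2:_(s32), %e3:_(s32) = G_UNMERGE_VALUES %rsrc(p8)
//   %vec:_(<4 x s32>) = G_BUILD_VECTOR %e0, %e1, %e2, %e3
// Vectors of p8 are instead converted with G_PTRTOINT to the s128 element form
// and then bitcast to the corresponding <4*N x s32> register type.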
555 
556 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
557                                      unsigned Idx) {
558   MachineOperand &MO = MI.getOperand(Idx);
559 
560   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
561   // Paranoidly prevent us from doing this multiple times.
562   if (!hasBufferRsrcWorkaround(PointerTy))
563     return;
564   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
565 }
566 
567 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
568                                          const GCNTargetMachine &TM)
569   :  ST(ST_) {
570   using namespace TargetOpcode;
571 
572   auto GetAddrSpacePtr = [&TM](unsigned AS) {
573     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
574   };
575 
576   const LLT S1 = LLT::scalar(1);
577   const LLT S8 = LLT::scalar(8);
578   const LLT S16 = LLT::scalar(16);
579   const LLT S32 = LLT::scalar(32);
580   const LLT S64 = LLT::scalar(64);
581   const LLT S128 = LLT::scalar(128);
582   const LLT S256 = LLT::scalar(256);
583   const LLT S512 = LLT::scalar(512);
584   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
585 
586   const LLT V2S8 = LLT::fixed_vector(2, 8);
587   const LLT V2S16 = LLT::fixed_vector(2, 16);
588   const LLT V4S16 = LLT::fixed_vector(4, 16);
589 
590   const LLT V2S32 = LLT::fixed_vector(2, 32);
591   const LLT V3S32 = LLT::fixed_vector(3, 32);
592   const LLT V4S32 = LLT::fixed_vector(4, 32);
593   const LLT V5S32 = LLT::fixed_vector(5, 32);
594   const LLT V6S32 = LLT::fixed_vector(6, 32);
595   const LLT V7S32 = LLT::fixed_vector(7, 32);
596   const LLT V8S32 = LLT::fixed_vector(8, 32);
597   const LLT V9S32 = LLT::fixed_vector(9, 32);
598   const LLT V10S32 = LLT::fixed_vector(10, 32);
599   const LLT V11S32 = LLT::fixed_vector(11, 32);
600   const LLT V12S32 = LLT::fixed_vector(12, 32);
601   const LLT V13S32 = LLT::fixed_vector(13, 32);
602   const LLT V14S32 = LLT::fixed_vector(14, 32);
603   const LLT V15S32 = LLT::fixed_vector(15, 32);
604   const LLT V16S32 = LLT::fixed_vector(16, 32);
605   const LLT V32S32 = LLT::fixed_vector(32, 32);
606 
607   const LLT V2S64 = LLT::fixed_vector(2, 64);
608   const LLT V3S64 = LLT::fixed_vector(3, 64);
609   const LLT V4S64 = LLT::fixed_vector(4, 64);
610   const LLT V5S64 = LLT::fixed_vector(5, 64);
611   const LLT V6S64 = LLT::fixed_vector(6, 64);
612   const LLT V7S64 = LLT::fixed_vector(7, 64);
613   const LLT V8S64 = LLT::fixed_vector(8, 64);
614   const LLT V16S64 = LLT::fixed_vector(16, 64);
615 
616   std::initializer_list<LLT> AllS32Vectors =
617     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
618      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
619   std::initializer_list<LLT> AllS64Vectors =
620     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
621 
622   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
623   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
624   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
625   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
626   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
627   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
628   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
629   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
630   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
631 
632   const LLT CodePtr = FlatPtr;
633 
634   const std::initializer_list<LLT> AddrSpaces64 = {
635     GlobalPtr, ConstantPtr, FlatPtr
636   };
637 
638   const std::initializer_list<LLT> AddrSpaces32 = {
639     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
640   };
641 
642   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
643 
644   const std::initializer_list<LLT> FPTypesBase = {
645     S32, S64
646   };
647 
648   const std::initializer_list<LLT> FPTypes16 = {
649     S32, S64, S16
650   };
651 
652   const std::initializer_list<LLT> FPTypesPK16 = {
653     S32, S64, S16, V2S16
654   };
655 
656   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
657 
658   // s1 for VCC branches, s32 for SCC branches.
659   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
660 
661   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
662   // elements for v3s16
663   getActionDefinitionsBuilder(G_PHI)
664       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
665       .legalFor(AllS32Vectors)
666       .legalFor(AllS64Vectors)
667       .legalFor(AddrSpaces64)
668       .legalFor(AddrSpaces32)
669       .legalFor(AddrSpaces128)
670       .legalIf(isPointer(0))
671       .clampScalar(0, S16, S256)
672       .widenScalarToNextPow2(0, 32)
673       .clampMaxNumElements(0, S32, 16)
674       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
675       .scalarize(0);
676 
677   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
678     // Full set of gfx9 features.
679     getActionDefinitionsBuilder({G_ADD, G_SUB})
680       .legalFor({S32, S16, V2S16})
681       .clampMaxNumElementsStrict(0, S16, 2)
682       .scalarize(0)
683       .minScalar(0, S16)
684       .widenScalarToNextMultipleOf(0, 32)
685       .maxScalar(0, S32);
686 
687     getActionDefinitionsBuilder(G_MUL)
688       .legalFor({S32, S16, V2S16})
689       .clampMaxNumElementsStrict(0, S16, 2)
690       .scalarize(0)
691       .minScalar(0, S16)
692       .widenScalarToNextMultipleOf(0, 32)
693       .custom();
694     assert(ST.hasMad64_32());
695 
696     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
697       .legalFor({S32, S16, V2S16}) // Clamp modifier
698       .minScalarOrElt(0, S16)
699       .clampMaxNumElementsStrict(0, S16, 2)
700       .scalarize(0)
701       .widenScalarToNextPow2(0, 32)
702       .lower();
703   } else if (ST.has16BitInsts()) {
704     getActionDefinitionsBuilder({G_ADD, G_SUB})
705       .legalFor({S32, S16})
706       .minScalar(0, S16)
707       .widenScalarToNextMultipleOf(0, 32)
708       .maxScalar(0, S32)
709       .scalarize(0);
710 
711     getActionDefinitionsBuilder(G_MUL)
712       .legalFor({S32, S16})
713       .scalarize(0)
714       .minScalar(0, S16)
715       .widenScalarToNextMultipleOf(0, 32)
716       .custom();
717     assert(ST.hasMad64_32());
718 
719     // Technically the saturating operations require clamp bit support, but this
720     // was introduced at the same time as 16-bit operations.
721     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
722       .legalFor({S32, S16}) // Clamp modifier
723       .minScalar(0, S16)
724       .scalarize(0)
725       .widenScalarToNextPow2(0, 16)
726       .lower();
727 
728     // We're just lowering this, but it helps get a better result to try to
729     // coerce to the desired type first.
730     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
731       .minScalar(0, S16)
732       .scalarize(0)
733       .lower();
734   } else {
735     getActionDefinitionsBuilder({G_ADD, G_SUB})
736       .legalFor({S32})
737       .widenScalarToNextMultipleOf(0, 32)
738       .clampScalar(0, S32, S32)
739       .scalarize(0);
740 
741     auto &Mul = getActionDefinitionsBuilder(G_MUL)
742       .legalFor({S32})
743       .scalarize(0)
744       .minScalar(0, S32)
745       .widenScalarToNextMultipleOf(0, 32);
746 
747     if (ST.hasMad64_32())
748       Mul.custom();
749     else
750       Mul.maxScalar(0, S32);
751 
752     if (ST.hasIntClamp()) {
753       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
754         .legalFor({S32}) // Clamp modifier.
755         .scalarize(0)
756         .minScalarOrElt(0, S32)
757         .lower();
758     } else {
759       // Clamp bit support was added in VI, along with 16-bit operations.
760       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
761         .minScalar(0, S32)
762         .scalarize(0)
763         .lower();
764     }
765 
766     // FIXME: DAG expansion gets better results. The widening uses the smaller
767     // range values and goes for the min/max lowering directly.
768     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
769       .minScalar(0, S32)
770       .scalarize(0)
771       .lower();
772   }
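  // A sketch of what the rules above mean in practice (register names are
  // illustrative): on a target without 16-bit instructions, an s16 add such as
  //   %sum:_(s16) = G_ADD %a, %b        ; %a and %b are s16
  // is widened to an s32 G_ADD (operands extended, result truncated back),
  // while on gfx9 an s16 or <2 x s16> G_ADD is already legal and left alone.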
773 
774   getActionDefinitionsBuilder(
775       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
776       .customFor({S32, S64})
777       .clampScalar(0, S32, S64)
778       .widenScalarToNextPow2(0, 32)
779       .scalarize(0);
780 
781   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
782                    .legalFor({S32})
783                    .maxScalar(0, S32);
784 
785   if (ST.hasVOP3PInsts()) {
786     Mulh
787       .clampMaxNumElements(0, S8, 2)
788       .lowerFor({V2S8});
789   }
790 
791   Mulh
792     .scalarize(0)
793     .lower();
794 
795   // Report legal for any types we can handle anywhere. For the cases only legal
796   // on the SALU, RegBankSelect will be able to re-legalize.
797   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
798     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
799     .clampScalar(0, S32, S64)
800     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
801     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
802     .widenScalarToNextPow2(0)
803     .scalarize(0);
804 
805   getActionDefinitionsBuilder(
806       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
807       .legalFor({{S32, S1}, {S32, S32}})
808       .clampScalar(0, S32, S32)
809       .scalarize(0);
810 
811   getActionDefinitionsBuilder(G_BITCAST)
812     // Don't worry about the size constraint.
813     .legalIf(all(isRegisterType(0), isRegisterType(1)))
814     .lower();
815 
816 
817   getActionDefinitionsBuilder(G_CONSTANT)
818     .legalFor({S1, S32, S64, S16, GlobalPtr,
819                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
820     .legalIf(isPointer(0))
821     .clampScalar(0, S32, S64)
822     .widenScalarToNextPow2(0);
823 
824   getActionDefinitionsBuilder(G_FCONSTANT)
825     .legalFor({S32, S64, S16})
826     .clampScalar(0, S16, S64);
827 
828   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
829       .legalIf(isRegisterType(0))
830       // s1 and s16 are special cases because they have legal operations on
831       // them, but don't really occupy registers in the normal way.
832       .legalFor({S1, S16})
833       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
834       .clampScalarOrElt(0, S32, MaxScalar)
835       .widenScalarToNextPow2(0, 32)
836       .clampMaxNumElements(0, S32, 16);
837 
838   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
839 
840   // If the amount is divergent, we have to do a wave reduction to get the
841   // maximum value, so this is expanded during RegBankSelect.
842   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
843     .legalFor({{PrivatePtr, S32}});
844 
845   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
846     .customIf(typeIsNot(0, PrivatePtr));
847 
848   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
849 
850   auto &FPOpActions = getActionDefinitionsBuilder(
851     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
852       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
853     .legalFor({S32, S64});
854   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
855     .customFor({S32, S64});
856   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
857     .customFor({S32, S64});
858 
859   if (ST.has16BitInsts()) {
860     if (ST.hasVOP3PInsts())
861       FPOpActions.legalFor({S16, V2S16});
862     else
863       FPOpActions.legalFor({S16});
864 
865     TrigActions.customFor({S16});
866     FDIVActions.customFor({S16});
867   }
868 
869   auto &MinNumMaxNum = getActionDefinitionsBuilder({
870       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
871 
872   if (ST.hasVOP3PInsts()) {
873     MinNumMaxNum.customFor(FPTypesPK16)
874       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
875       .clampMaxNumElements(0, S16, 2)
876       .clampScalar(0, S16, S64)
877       .scalarize(0);
878   } else if (ST.has16BitInsts()) {
879     MinNumMaxNum.customFor(FPTypes16)
880       .clampScalar(0, S16, S64)
881       .scalarize(0);
882   } else {
883     MinNumMaxNum.customFor(FPTypesBase)
884       .clampScalar(0, S32, S64)
885       .scalarize(0);
886   }
887 
888   if (ST.hasVOP3PInsts())
889     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
890 
891   FPOpActions
892     .scalarize(0)
893     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
894 
895   TrigActions
896     .scalarize(0)
897     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
898 
899   FDIVActions
900     .scalarize(0)
901     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
902 
903   getActionDefinitionsBuilder({G_FNEG, G_FABS})
904     .legalFor(FPTypesPK16)
905     .clampMaxNumElementsStrict(0, S16, 2)
906     .scalarize(0)
907     .clampScalar(0, S16, S64);
908 
909   if (ST.has16BitInsts()) {
910     getActionDefinitionsBuilder(G_FSQRT)
911       .legalFor({S32, S16})
912       .customFor({S64})
913       .scalarize(0)
914       .clampScalar(0, S16, S64);
915     getActionDefinitionsBuilder(G_FFLOOR)
916       .legalFor({S32, S64, S16})
917       .scalarize(0)
918       .clampScalar(0, S16, S64);
919 
920     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
921       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
922       .scalarize(0)
923       .maxScalarIf(typeIs(0, S16), 1, S16)
924       .clampScalar(1, S32, S32)
925       .lower();
926 
927     getActionDefinitionsBuilder(G_FFREXP)
928       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
929       .scalarize(0)
930       .lower();
931   } else {
932     getActionDefinitionsBuilder(G_FSQRT)
933       .legalFor({S32})
934       .customFor({S64})
935       .scalarize(0)
936       .clampScalar(0, S32, S64);
937 
938     if (ST.hasFractBug()) {
939       getActionDefinitionsBuilder(G_FFLOOR)
940         .customFor({S64})
941         .legalFor({S32, S64})
942         .scalarize(0)
943         .clampScalar(0, S32, S64);
944     } else {
945       getActionDefinitionsBuilder(G_FFLOOR)
946         .legalFor({S32, S64})
947         .scalarize(0)
948         .clampScalar(0, S32, S64);
949     }
950 
951     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
952       .legalFor({{S32, S32}, {S64, S32}})
953       .scalarize(0)
954       .clampScalar(0, S32, S64)
955       .clampScalar(1, S32, S32)
956       .lower();
957 
958     getActionDefinitionsBuilder(G_FFREXP)
959       .customFor({{S32, S32}, {S64, S32}})
960       .scalarize(0)
961       .minScalar(0, S32)
962       .clampScalar(1, S32, S32)
963       .lower();
964   }
965 
966   getActionDefinitionsBuilder(G_FPTRUNC)
967     .legalFor({{S32, S64}, {S16, S32}})
968     .scalarize(0)
969     .lower();
970 
971   getActionDefinitionsBuilder(G_FPEXT)
972     .legalFor({{S64, S32}, {S32, S16}})
973     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
974     .scalarize(0);
975 
976   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
977   if (ST.has16BitInsts()) {
978     FSubActions
979       // Use actual fsub instruction
980       .legalFor({S32, S16})
981       // Must use fadd + fneg
982       .lowerFor({S64, V2S16});
983   } else {
984     FSubActions
985       // Use actual fsub instruction
986       .legalFor({S32})
987       // Must use fadd + fneg
988       .lowerFor({S64, S16, V2S16});
989   }
990 
991   FSubActions
992     .scalarize(0)
993     .clampScalar(0, S32, S64);
994 
995   // Whether this is legal depends on the floating point mode for the function.
996   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
997   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
998     FMad.customFor({S32, S16});
999   else if (ST.hasMadMacF32Insts())
1000     FMad.customFor({S32});
1001   else if (ST.hasMadF16())
1002     FMad.customFor({S16});
1003   FMad.scalarize(0)
1004       .lower();
1005 
1006   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1007   if (ST.has16BitInsts()) {
1008     FRem.customFor({S16, S32, S64});
1009   } else {
1010     FRem.minScalar(0, S32)
1011         .customFor({S32, S64});
1012   }
1013   FRem.scalarize(0);
1014 
1015   // TODO: Do we need to clamp maximum bitwidth?
1016   getActionDefinitionsBuilder(G_TRUNC)
1017     .legalIf(isScalar(0))
1018     .legalFor({{V2S16, V2S32}})
1019     .clampMaxNumElements(0, S16, 2)
1020     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1021     // situations (like an invalid implicit use), we don't want to infinite loop
1022     // in the legalizer.
1023     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1024     .alwaysLegal();
1025 
1026   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1027     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1028                {S32, S1}, {S64, S1}, {S16, S1}})
1029     .scalarize(0)
1030     .clampScalar(0, S32, S64)
1031     .widenScalarToNextPow2(1, 32);
1032 
1033   // TODO: Split s1->s64 during regbankselect for VALU.
1034   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1035                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1036                     .lowerIf(typeIs(1, S1))
1037                     .customFor({{S32, S64}, {S64, S64}});
1038   if (ST.has16BitInsts())
1039     IToFP.legalFor({{S16, S16}});
1040   IToFP.clampScalar(1, S32, S64)
1041        .minScalar(0, S32)
1042        .scalarize(0)
1043        .widenScalarToNextPow2(1);
1044 
1045   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1046     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1047     .customFor({{S64, S32}, {S64, S64}})
1048     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1049   if (ST.has16BitInsts())
1050     FPToI.legalFor({{S16, S16}});
1051   else
1052     FPToI.minScalar(1, S32);
1053 
1054   FPToI.minScalar(0, S32)
1055        .widenScalarToNextPow2(0, 32)
1056        .scalarize(0)
1057        .lower();
1058 
1059   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1060       .customFor({S16, S32})
1061       .scalarize(0)
1062       .lower();
1063 
1064   // Lower roundeven into G_FRINT
1065   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
1066     .scalarize(0)
1067     .lower();
1068 
1069   if (ST.has16BitInsts()) {
1070     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1071       .legalFor({S16, S32, S64})
1072       .clampScalar(0, S16, S64)
1073       .scalarize(0);
1074   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1075     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1076       .legalFor({S32, S64})
1077       .clampScalar(0, S32, S64)
1078       .scalarize(0);
1079   } else {
1080     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1081       .legalFor({S32})
1082       .customFor({S64})
1083       .clampScalar(0, S32, S64)
1084       .scalarize(0);
1085   }
1086 
1087   getActionDefinitionsBuilder(G_PTR_ADD)
1088       .unsupportedFor({BufferFatPtr, RsrcPtr})
1089       .legalIf(all(isPointer(0), sameSize(0, 1)))
1090       .scalarize(0)
1091       .scalarSameSizeAs(1, 0);
1092 
1093   getActionDefinitionsBuilder(G_PTRMASK)
1094     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1095     .scalarSameSizeAs(1, 0)
1096     .scalarize(0);
1097 
1098   auto &CmpBuilder =
1099     getActionDefinitionsBuilder(G_ICMP)
1100     // The compare output type differs based on the register bank of the output,
1101     // so make both s1 and s32 legal.
1102     //
1103     // Scalar compares producing output in scc will be promoted to s32, as that
1104     // is the allocatable register type that will be needed for the copy from
1105     // scc. This will be promoted during RegBankSelect, and we assume something
1106     // before that won't try to use s32 result types.
1107     //
1108     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1109     // bank.
1110     .legalForCartesianProduct(
1111       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1112     .legalForCartesianProduct(
1113       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1114   if (ST.has16BitInsts()) {
1115     CmpBuilder.legalFor({{S1, S16}});
1116   }
1117 
1118   CmpBuilder
1119     .widenScalarToNextPow2(1)
1120     .clampScalar(1, S32, S64)
1121     .scalarize(0)
1122     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1123 
1124   getActionDefinitionsBuilder(G_FCMP)
1125     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
1126     .widenScalarToNextPow2(1)
1127     .clampScalar(1, S32, S64)
1128     .scalarize(0);
1129 
1130   // FIXME: fpow has a selection pattern that should move to custom lowering.
1131   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1132   if (ST.has16BitInsts())
1133     ExpOps.customFor({{S32}, {S16}});
1134   else
1135     ExpOps.customFor({S32});
1136   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1137         .scalarize(0);
1138 
1139   getActionDefinitionsBuilder(G_FPOWI)
1140     .clampScalar(0, MinScalarFPTy, S32)
1141     .lower();
1142 
1143   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1144   Log2Ops.customFor({S32});
1145   if (ST.has16BitInsts())
1146     Log2Ops.legalFor({S16});
1147   else
1148     Log2Ops.customFor({S16});
1149   Log2Ops.scalarize(0)
1150     .lower();
1151 
1152   auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
1153   LogOps.customFor({S32, S16});
1154   LogOps.clampScalar(0, MinScalarFPTy, S32)
1155         .scalarize(0);
1156 
1157   // The 64-bit versions produce 32-bit results, but only on the SALU.
1158   getActionDefinitionsBuilder(G_CTPOP)
1159     .legalFor({{S32, S32}, {S32, S64}})
1160     .clampScalar(0, S32, S32)
1161     .widenScalarToNextPow2(1, 32)
1162     .clampScalar(1, S32, S64)
1163     .scalarize(0)
1164     .widenScalarToNextPow2(0, 32);
1165 
1166   // If no 16 bit instr is available, lower into different instructions.
1167   if (ST.has16BitInsts())
1168     getActionDefinitionsBuilder(G_IS_FPCLASS)
1169         .legalForCartesianProduct({S1}, FPTypes16)
1170         .widenScalarToNextPow2(1)
1171         .scalarize(0)
1172         .lower();
1173   else
1174     getActionDefinitionsBuilder(G_IS_FPCLASS)
1175         .legalForCartesianProduct({S1}, FPTypesBase)
1176         .lowerFor({S1, S16})
1177         .widenScalarToNextPow2(1)
1178         .scalarize(0)
1179         .lower();
1180 
1181   // The hardware instructions return a different result on 0 than the generic
1182   // instructions expect. The hardware produces -1, but these produce the
1183   // bitwidth.
1184   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1185     .scalarize(0)
1186     .clampScalar(0, S32, S32)
1187     .clampScalar(1, S32, S64)
1188     .widenScalarToNextPow2(0, 32)
1189     .widenScalarToNextPow2(1, 32)
1190     .custom();
1191 
1192   // The 64-bit versions produce 32-bit results, but only on the SALU.
1193   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1194     .legalFor({{S32, S32}, {S32, S64}})
1195     .clampScalar(0, S32, S32)
1196     .clampScalar(1, S32, S64)
1197     .scalarize(0)
1198     .widenScalarToNextPow2(0, 32)
1199     .widenScalarToNextPow2(1, 32);
1200 
1201   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1202   // RegBankSelect.
1203   getActionDefinitionsBuilder(G_BITREVERSE)
1204     .legalFor({S32, S64})
1205     .clampScalar(0, S32, S64)
1206     .scalarize(0)
1207     .widenScalarToNextPow2(0);
1208 
1209   if (ST.has16BitInsts()) {
1210     getActionDefinitionsBuilder(G_BSWAP)
1211       .legalFor({S16, S32, V2S16})
1212       .clampMaxNumElementsStrict(0, S16, 2)
1213       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1214       // narrowScalar limitation.
1215       .widenScalarToNextPow2(0)
1216       .clampScalar(0, S16, S32)
1217       .scalarize(0);
1218 
1219     if (ST.hasVOP3PInsts()) {
1220       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1221         .legalFor({S32, S16, V2S16})
1222         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1223         .clampMaxNumElements(0, S16, 2)
1224         .minScalar(0, S16)
1225         .widenScalarToNextPow2(0)
1226         .scalarize(0)
1227         .lower();
1228     } else {
1229       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1230         .legalFor({S32, S16})
1231         .widenScalarToNextPow2(0)
1232         .minScalar(0, S16)
1233         .scalarize(0)
1234         .lower();
1235     }
1236   } else {
1237     // TODO: Should have same legality without v_perm_b32
1238     getActionDefinitionsBuilder(G_BSWAP)
1239       .legalFor({S32})
1240       .lowerIf(scalarNarrowerThan(0, 32))
1241       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1242       // narrowScalar limitation.
1243       .widenScalarToNextPow2(0)
1244       .maxScalar(0, S32)
1245       .scalarize(0)
1246       .lower();
1247 
1248     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1249       .legalFor({S32})
1250       .minScalar(0, S32)
1251       .widenScalarToNextPow2(0)
1252       .scalarize(0)
1253       .lower();
1254   }
1255 
1256   getActionDefinitionsBuilder(G_INTTOPTR)
1257       // List the common cases
1258       .legalForCartesianProduct(AddrSpaces64, {S64})
1259       .legalForCartesianProduct(AddrSpaces32, {S32})
1260       .scalarize(0)
1261       // Accept any address space as long as the size matches
1262       .legalIf(sameSize(0, 1))
1263       .widenScalarIf(smallerThan(1, 0),
1264                      [](const LegalityQuery &Query) {
1265                        return std::pair(
1266                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1267                      })
1268       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1269         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1270       });
1271 
1272   getActionDefinitionsBuilder(G_PTRTOINT)
1273       // List the common cases
1274       .legalForCartesianProduct(AddrSpaces64, {S64})
1275       .legalForCartesianProduct(AddrSpaces32, {S32})
1276       .scalarize(0)
1277       // Accept any address space as long as the size matches
1278       .legalIf(sameSize(0, 1))
1279       .widenScalarIf(smallerThan(0, 1),
1280                      [](const LegalityQuery &Query) {
1281                        return std::pair(
1282                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1283                      })
1284       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1285         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1286       });
1287 
1288   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1289     .scalarize(0)
1290     .custom();
1291 
1292   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1293                                     bool IsLoad) -> bool {
1294     const LLT DstTy = Query.Types[0];
1295 
1296     // Split vector extloads.
1297     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1298 
1299     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1300       return true;
1301 
1302     const LLT PtrTy = Query.Types[1];
1303     unsigned AS = PtrTy.getAddressSpace();
1304     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1305                                       Query.MMODescrs[0].Ordering !=
1306                                           AtomicOrdering::NotAtomic))
1307       return true;
1308 
1309     // Catch weird sized loads that don't evenly divide into the access sizes
1310     // TODO: May be able to widen depending on alignment etc.
1311     unsigned NumRegs = (MemSize + 31) / 32;
1312     if (NumRegs == 3) {
1313       if (!ST.hasDwordx3LoadStores())
1314         return true;
1315     } else {
1316       // If the alignment allows, these should have been widened.
1317       if (!isPowerOf2_32(NumRegs))
1318         return true;
1319     }
1320 
1321     return false;
1322   };
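  // For example, a 256-bit global G_STORE exceeds the 128-bit store limit and
  // must be split, while a 256-bit global G_LOAD (limit 512 bits) need not be;
  // a 96-bit access is split on subtargets without dwordx3 load/stores.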
1323 
1324   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1325   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1326   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1327 
1328   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1329   // LDS
1330   // TODO: Unsupported flat for SI.
1331 
1332   for (unsigned Op : {G_LOAD, G_STORE}) {
1333     const bool IsStore = Op == G_STORE;
1334 
1335     auto &Actions = getActionDefinitionsBuilder(Op);
1336     // Explicitly list some common cases.
1337     // TODO: Does this help compile time at all?
1338     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1339                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1340                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1341                                       {S64, GlobalPtr, S64, GlobalAlign32},
1342                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1343                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1344                                       {S32, GlobalPtr, S8, GlobalAlign8},
1345                                       {S32, GlobalPtr, S16, GlobalAlign16},
1346 
1347                                       {S32, LocalPtr, S32, 32},
1348                                       {S64, LocalPtr, S64, 32},
1349                                       {V2S32, LocalPtr, V2S32, 32},
1350                                       {S32, LocalPtr, S8, 8},
1351                                       {S32, LocalPtr, S16, 16},
1352                                       {V2S16, LocalPtr, S32, 32},
1353 
1354                                       {S32, PrivatePtr, S32, 32},
1355                                       {S32, PrivatePtr, S8, 8},
1356                                       {S32, PrivatePtr, S16, 16},
1357                                       {V2S16, PrivatePtr, S32, 32},
1358 
1359                                       {S32, ConstantPtr, S32, GlobalAlign32},
1360                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1361                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1362                                       {S64, ConstantPtr, S64, GlobalAlign32},
1363                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1364     Actions.legalIf(
1365       [=](const LegalityQuery &Query) -> bool {
1366         return isLoadStoreLegal(ST, Query);
1367       });
1368 
1369     // The custom pointers (fat pointers, buffer resources) don't work with load
1370     // and store at this level. Fat pointers should have been lowered to
1371     // intrinsics before the translation to MIR.
1372     Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
1373 
1374     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1375     // ptrtoint. This is needed to account for the fact that we can't have i128
1376     // as a register class for SelectionDAG reasons.
1377     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1378       return hasBufferRsrcWorkaround(Query.Types[0]);
1379     });
1380 
1381     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1382     // 64-bits.
1383     //
1384     // TODO: Should generalize bitcast action into coerce, which will also cover
1385     // inserting addrspacecasts.
1386     Actions.customIf(typeIs(1, Constant32Ptr));
1387 
1388     // Turn any illegal element vectors into something easier to deal
1389     // with. These will ultimately produce 32-bit scalar shifts to extract the
1390     // parts anyway.
1391     //
1392     // For odd 16-bit element vectors, prefer to split those into pieces with
1393     // 16-bit vector parts.
1394     Actions.bitcastIf(
1395       [=](const LegalityQuery &Query) -> bool {
1396         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1397                                           Query.MMODescrs[0].MemoryTy);
1398       }, bitcastToRegisterType(0));
1399 
1400     if (!IsStore) {
1401       // Widen suitably aligned loads by loading extra bytes. The standard
1402       // legalization actions can't properly express widening memory operands.
1403       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1404         return shouldWidenLoad(ST, Query, G_LOAD);
1405       });
1406     }
1407 
1408     // FIXME: load/store narrowing should be moved to lower action
1409     Actions
1410         .narrowScalarIf(
1411             [=](const LegalityQuery &Query) -> bool {
1412               return !Query.Types[0].isVector() &&
1413                      needToSplitMemOp(Query, Op == G_LOAD);
1414             },
1415             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1416               const LLT DstTy = Query.Types[0];
1417               const LLT PtrTy = Query.Types[1];
1418 
1419               const unsigned DstSize = DstTy.getSizeInBits();
1420               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1421 
1422               // Split extloads.
1423               if (DstSize > MemSize)
1424                 return std::pair(0, LLT::scalar(MemSize));
1425 
1426               unsigned MaxSize = maxSizeForAddrSpace(
1427                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1428                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1429               if (MemSize > MaxSize)
1430                 return std::pair(0, LLT::scalar(MaxSize));
1431 
1432               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1433               return std::pair(0, LLT::scalar(Align));
1434             })
1435         .fewerElementsIf(
1436             [=](const LegalityQuery &Query) -> bool {
1437               return Query.Types[0].isVector() &&
1438                      needToSplitMemOp(Query, Op == G_LOAD);
1439             },
1440             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1441               const LLT DstTy = Query.Types[0];
1442               const LLT PtrTy = Query.Types[1];
1443 
1444               LLT EltTy = DstTy.getElementType();
1445               unsigned MaxSize = maxSizeForAddrSpace(
1446                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1447                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1448 
1449               // FIXME: Handle results widened to a power of 2 better. This
1450               // ends up scalarizing.
1451               // FIXME: 3-element stores are scalarized on SI.
1452 
1453               // Split if it's too large for the address space.
1454               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1455               if (MemSize > MaxSize) {
1456                 unsigned NumElts = DstTy.getNumElements();
1457                 unsigned EltSize = EltTy.getSizeInBits();
1458 
1459                 if (MaxSize % EltSize == 0) {
1460                   return std::pair(
1461                       0, LLT::scalarOrVector(
1462                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1463                 }
1464 
1465                 unsigned NumPieces = MemSize / MaxSize;
1466 
1467                 // FIXME: Refine when odd breakdowns handled
1468                 // The scalars will need to be re-legalized.
1469                 if (NumPieces == 1 || NumPieces >= NumElts ||
1470                     NumElts % NumPieces != 0)
1471                   return std::pair(0, EltTy);
1472 
1473                 return std::pair(0,
1474                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1475               }
1476 
1477               // FIXME: We could probably handle weird extending loads better.
1478               if (DstTy.getSizeInBits() > MemSize)
1479                 return std::pair(0, EltTy);
1480 
1481               unsigned EltSize = EltTy.getSizeInBits();
1482               unsigned DstSize = DstTy.getSizeInBits();
1483               if (!isPowerOf2_32(DstSize)) {
1484                 // We're probably decomposing an odd-sized store. Try to split
1485                 // to the widest type. TODO: Account for alignment. As-is it
1486                 // should be OK, since the new parts will be further legalized.
1487                 unsigned FloorSize = llvm::bit_floor(DstSize);
1488                 return std::pair(
1489                     0, LLT::scalarOrVector(
1490                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1491               }
1492 
1493               // May need relegalization for the scalars.
1494               return std::pair(0, EltTy);
1495             })
1496     .minScalar(0, S32)
1497     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1498     .widenScalarToNextPow2(0)
1499     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1500     .lower();
1501   }
1502 
1503   // FIXME: Unaligned accesses not lowered.
1504   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1505                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1506                                                   {S32, GlobalPtr, S16, 2 * 8},
1507                                                   {S32, LocalPtr, S8, 8},
1508                                                   {S32, LocalPtr, S16, 16},
1509                                                   {S32, PrivatePtr, S8, 8},
1510                                                   {S32, PrivatePtr, S16, 16},
1511                                                   {S32, ConstantPtr, S8, 8},
1512                                                   {S32, ConstantPtr, S16, 2 * 8}})
1513                        .legalIf(
1514                          [=](const LegalityQuery &Query) -> bool {
1515                            return isLoadStoreLegal(ST, Query);
1516                          });
1517 
1518   if (ST.hasFlatAddressSpace()) {
1519     ExtLoads.legalForTypesWithMemDesc(
1520         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1521   }
1522 
1523   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1524   // 64 bits.
1525   //
1526   // TODO: Should generalize bitcast action into coerce, which will also cover
1527   // inserting addrspacecasts.
1528   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1529 
1530   ExtLoads.clampScalar(0, S32, S32)
1531           .widenScalarToNextPow2(0)
1532           .lower();
1533 
1534   auto &Atomics = getActionDefinitionsBuilder(
1535     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1536      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1537      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1538      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1539     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1540                {S64, GlobalPtr}, {S64, LocalPtr},
1541                {S32, RegionPtr}, {S64, RegionPtr}});
1542   if (ST.hasFlatAddressSpace()) {
1543     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1544   }
1545 
1546   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1547   if (ST.hasLDSFPAtomicAdd()) {
1548     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1549     if (ST.hasGFX90AInsts())
1550       Atomic.legalFor({{S64, LocalPtr}});
1551     if (ST.hasAtomicDsPkAdd16Insts())
1552       Atomic.legalFor({{V2S16, LocalPtr}});
1553   }
1554   if (ST.hasAtomicFaddInsts())
1555     Atomic.legalFor({{S32, GlobalPtr}});
1556   if (ST.hasFlatAtomicFaddF32Inst())
1557     Atomic.legalFor({{S32, FlatPtr}});
1558 
1559   if (ST.hasGFX90AInsts()) {
1560     // These are legal with some caveats, and should have undergone expansion in
1561     // the IR in most situations.
1562     // TODO: Move atomic expansion into legalizer
1563     Atomic.legalFor({
1564         {S32, GlobalPtr},
1565         {S64, GlobalPtr},
1566         {S64, FlatPtr}
1567       });
1568   }
1569 
1570   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1571   // demarshalling.
1572   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1573     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1574                 {S32, FlatPtr}, {S64, FlatPtr}})
1575     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1576                {S32, RegionPtr}, {S64, RegionPtr}});
1577   // TODO: Pointer types, any 32-bit or 64-bit vector
1578 
1579   // Condition should be s32 for scalar, s1 for vector.
1580   getActionDefinitionsBuilder(G_SELECT)
1581       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1582                                  LocalPtr, FlatPtr, PrivatePtr,
1583                                  LLT::fixed_vector(2, LocalPtr),
1584                                  LLT::fixed_vector(2, PrivatePtr)},
1585                                 {S1, S32})
1586       .clampScalar(0, S16, S64)
1587       .scalarize(1)
1588       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1589       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1590       .clampMaxNumElements(0, S32, 2)
1591       .clampMaxNumElements(0, LocalPtr, 2)
1592       .clampMaxNumElements(0, PrivatePtr, 2)
1593       .scalarize(0)
1594       .widenScalarToNextPow2(0)
1595       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1596 
1597   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1598   // be more flexible with the shift amount type.
1599   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1600     .legalFor({{S32, S32}, {S64, S32}});
1601   if (ST.has16BitInsts()) {
1602     if (ST.hasVOP3PInsts()) {
1603       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1604             .clampMaxNumElements(0, S16, 2);
1605     } else
1606       Shifts.legalFor({{S16, S16}});
1607 
1608     // TODO: Support 16-bit shift amounts for all types
1609     Shifts.widenScalarIf(
1610       [=](const LegalityQuery &Query) {
1611         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1612         // 32-bit amount.
1613         const LLT ValTy = Query.Types[0];
1614         const LLT AmountTy = Query.Types[1];
1615         return ValTy.getSizeInBits() <= 16 &&
1616                AmountTy.getSizeInBits() < 16;
1617       }, changeTo(1, S16));
1618     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1619     Shifts.clampScalar(1, S32, S32);
1620     Shifts.widenScalarToNextPow2(0, 16);
1621     Shifts.clampScalar(0, S16, S64);
1622 
1623     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1624       .minScalar(0, S16)
1625       .scalarize(0)
1626       .lower();
1627   } else {
1628     // Make sure we legalize the shift amount type first, as the general
1629     // expansion for the shifted type will produce much worse code if it hasn't
1630     // been truncated already.
1631     Shifts.clampScalar(1, S32, S32);
1632     Shifts.widenScalarToNextPow2(0, 32);
1633     Shifts.clampScalar(0, S32, S64);
1634 
1635     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1636       .minScalar(0, S32)
1637       .scalarize(0)
1638       .lower();
1639   }
1640   Shifts.scalarize(0);
1641 
1642   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1643     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1644     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1645     unsigned IdxTypeIdx = 2;
1646 
1647     getActionDefinitionsBuilder(Op)
1648       .customIf([=](const LegalityQuery &Query) {
1649           const LLT EltTy = Query.Types[EltTypeIdx];
1650           const LLT VecTy = Query.Types[VecTypeIdx];
1651           const LLT IdxTy = Query.Types[IdxTypeIdx];
1652           const unsigned EltSize = EltTy.getSizeInBits();
1653           const bool isLegalVecType =
1654               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1655           // Address space 8 pointers are 128-bit wide values, but the logic
1656           // below will try to bitcast them to 2N x s64, which will fail.
1657           // Therefore, as an intermediate step, handle extracts/insertions by
1658           // ptrtoint-ing the vector and scalar arguments (and inttoptr-ing the
1659           // extraction result) in order to produce a vector operation that can
1660           // be handled by the logic below.
1661           if (EltTy.isPointer() && EltSize > 64)
1662             return true;
1663           return (EltSize == 32 || EltSize == 64) &&
1664                   VecTy.getSizeInBits() % 32 == 0 &&
1665                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1666                   IdxTy.getSizeInBits() == 32 &&
1667                   isLegalVecType;
1668         })
1669       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1670                  bitcastToVectorElement32(VecTypeIdx))
1671       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1672       .bitcastIf(
1673         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1674         [=](const LegalityQuery &Query) {
1675           // For > 64-bit element types, try to turn this into a 64-bit
1676           // element vector since we may be able to do better indexing
1677           // if this is scalar. If not, fall back to 32.
1678           const LLT EltTy = Query.Types[EltTypeIdx];
1679           const LLT VecTy = Query.Types[VecTypeIdx];
1680           const unsigned DstEltSize = EltTy.getSizeInBits();
1681           const unsigned VecSize = VecTy.getSizeInBits();
1682 
1683           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1684           return std::pair(
1685               VecTypeIdx,
1686               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1687         })
1688       .clampScalar(EltTypeIdx, S32, S64)
1689       .clampScalar(VecTypeIdx, S32, S64)
1690       .clampScalar(IdxTypeIdx, S32, S32)
1691       .clampMaxNumElements(VecTypeIdx, S32, 32)
1692       // TODO: Clamp elements for 64-bit vectors?
1693       .moreElementsIf(
1694         isIllegalRegisterType(VecTypeIdx),
1695         moreElementsToNextExistingRegClass(VecTypeIdx))
1696       // This should only be necessary with variable indices.
1697       // As a last resort, lower to the stack.
1698       .lower();
1699   }
1700 
1701   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1702     .unsupportedIf([=](const LegalityQuery &Query) {
1703         const LLT &EltTy = Query.Types[1].getElementType();
1704         return Query.Types[0] != EltTy;
1705       });
1706 
1707   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1708     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1709     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1710 
1711     // FIXME: Doesn't handle extract of illegal sizes.
1712     getActionDefinitionsBuilder(Op)
1713       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1714       .lowerIf([=](const LegalityQuery &Query) {
1715           // Sub-vector (or single element) insert and extract.
1716           // TODO: verify immediate offset here since lower only works with
1717           // whole elements.
1718           const LLT BigTy = Query.Types[BigTyIdx];
1719           return BigTy.isVector();
1720         })
1721       // FIXME: Multiples of 16 should not be legal.
1722       .legalIf([=](const LegalityQuery &Query) {
1723           const LLT BigTy = Query.Types[BigTyIdx];
1724           const LLT LitTy = Query.Types[LitTyIdx];
1725           return (BigTy.getSizeInBits() % 32 == 0) &&
1726                  (LitTy.getSizeInBits() % 16 == 0);
1727         })
1728       .widenScalarIf(
1729         [=](const LegalityQuery &Query) {
1730           const LLT BigTy = Query.Types[BigTyIdx];
1731           return (BigTy.getScalarSizeInBits() < 16);
1732         },
1733         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1734       .widenScalarIf(
1735         [=](const LegalityQuery &Query) {
1736           const LLT LitTy = Query.Types[LitTyIdx];
1737           return (LitTy.getScalarSizeInBits() < 16);
1738         },
1739         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1740       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1741       .widenScalarToNextPow2(BigTyIdx, 32);
1742 
1743   }
1744 
1745   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1746     .legalForCartesianProduct(AllS32Vectors, {S32})
1747     .legalForCartesianProduct(AllS64Vectors, {S64})
1748     .clampNumElements(0, V16S32, V32S32)
1749     .clampNumElements(0, V2S64, V16S64)
1750     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1751     .moreElementsIf(
1752       isIllegalRegisterType(0),
1753       moreElementsToNextExistingRegClass(0));
1754 
1755   if (ST.hasScalarPackInsts()) {
1756     BuildVector
1757       // FIXME: Should probably widen s1 vectors straight to s32
1758       .minScalarOrElt(0, S16)
1759       .minScalar(1, S16);
1760 
1761     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1762       .legalFor({V2S16, S32})
1763       .lower();
1764   } else {
1765     BuildVector.customFor({V2S16, S16});
1766     BuildVector.minScalarOrElt(0, S32);
1767 
1768     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1769       .customFor({V2S16, S32})
1770       .lower();
1771   }
1772 
1773   BuildVector.legalIf(isRegisterType(0));
1774 
1775   // FIXME: Clamp maximum size
1776   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1777     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1778     .clampMaxNumElements(0, S32, 32)
1779     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1780     .clampMaxNumElements(0, S16, 64);
1781 
1782   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1783 
1784   // Merge/Unmerge
1785   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1786     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1787     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1788 
1789     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1790       const LLT Ty = Query.Types[TypeIdx];
1791       if (Ty.isVector()) {
1792         const LLT &EltTy = Ty.getElementType();
1793         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1794           return true;
1795         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1796           return true;
1797       }
1798       return false;
1799     };
1800 
1801     auto &Builder = getActionDefinitionsBuilder(Op)
1802       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1803       .lowerFor({{S16, V2S16}})
1804       .lowerIf([=](const LegalityQuery &Query) {
1805           const LLT BigTy = Query.Types[BigTyIdx];
1806           return BigTy.getSizeInBits() == 32;
1807         })
1808       // Try to widen to s16 first for small types.
1809       // TODO: Only do this on targets with legal s16 shifts
1810       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1811       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1812       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1813       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1814                            elementTypeIs(1, S16)),
1815                        changeTo(1, V2S16))
1816       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1817       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1818       // valid.
1819       .clampScalar(LitTyIdx, S32, S512)
1820       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1821       // Break up vectors with weird elements into scalars
1822       .fewerElementsIf(
1823         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1824         scalarize(0))
1825       .fewerElementsIf(
1826         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1827         scalarize(1))
1828       .clampScalar(BigTyIdx, S32, MaxScalar);
1829 
1830     if (Op == G_MERGE_VALUES) {
1831       Builder.widenScalarIf(
1832         // TODO: Use 16-bit shifts if legal for 8-bit values?
1833         [=](const LegalityQuery &Query) {
1834           const LLT Ty = Query.Types[LitTyIdx];
1835           return Ty.getSizeInBits() < 32;
1836         },
1837         changeTo(LitTyIdx, S32));
1838     }
1839 
1840     Builder.widenScalarIf(
1841       [=](const LegalityQuery &Query) {
1842         const LLT Ty = Query.Types[BigTyIdx];
1843         return Ty.getSizeInBits() % 16 != 0;
1844       },
1845       [=](const LegalityQuery &Query) {
1846         // Pick the next power of 2, or a multiple of 64 over 128,
1847         // whichever is smaller.
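        // Worked example of this mutation (it only fires when the size is not
        // a multiple of 16): s65 -> next power of 2 is 128, which is below
        // 256, so use s128; s260 -> next power of 2 is 512, but rounding up
        // to a multiple of 64 gives 320, which is smaller, so use s320.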
1848         const LLT &Ty = Query.Types[BigTyIdx];
1849         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1850         if (NewSizeInBits >= 256) {
1851           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1852           if (RoundedTo < NewSizeInBits)
1853             NewSizeInBits = RoundedTo;
1854         }
1855         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1856       })
1857       // Any vectors left are the wrong size. Scalarize them.
1858       .scalarize(0)
1859       .scalarize(1);
1860   }
1861 
1862   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1863   // RegBankSelect.
1864   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1865     .legalFor({{S32}, {S64}});
1866 
1867   if (ST.hasVOP3PInsts()) {
1868     SextInReg.lowerFor({{V2S16}})
1869       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1870       // get more vector shift opportunities, since we'll get those when
1871       // expanded.
1872       .clampMaxNumElementsStrict(0, S16, 2);
1873   } else if (ST.has16BitInsts()) {
1874     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1875   } else {
1876     // Prefer to promote to s32 before lowering if we don't have 16-bit
1877     // shifts. This avoids a lot of intermediate truncate and extend operations.
1878     SextInReg.lowerFor({{S32}, {S64}});
1879   }
1880 
1881   SextInReg
1882     .scalarize(0)
1883     .clampScalar(0, S32, S64)
1884     .lower();
1885 
1886   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1887     .scalarize(0)
1888     .lower();
1889 
1890   // TODO: Only try to form v2s16 with legal packed instructions.
1891   getActionDefinitionsBuilder(G_FSHR)
1892     .legalFor({{S32, S32}})
1893     .lowerFor({{V2S16, V2S16}})
1894     .clampMaxNumElementsStrict(0, S16, 2)
1895     .scalarize(0)
1896     .lower();
1897 
1898   if (ST.hasVOP3PInsts()) {
1899     getActionDefinitionsBuilder(G_FSHL)
1900       .lowerFor({{V2S16, V2S16}})
1901       .clampMaxNumElementsStrict(0, S16, 2)
1902       .scalarize(0)
1903       .lower();
1904   } else {
1905     getActionDefinitionsBuilder(G_FSHL)
1906       .scalarize(0)
1907       .lower();
1908   }
1909 
1910   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1911     .legalFor({S64});
1912 
1913   getActionDefinitionsBuilder(G_FENCE)
1914     .alwaysLegal();
1915 
1916   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1917       .scalarize(0)
1918       .minScalar(0, S32)
1919       .lower();
1920 
1921   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1922       .legalFor({{S32, S32}, {S64, S32}})
1923       .clampScalar(1, S32, S32)
1924       .clampScalar(0, S32, S64)
1925       .widenScalarToNextPow2(0)
1926       .scalarize(0);
1927 
1928   getActionDefinitionsBuilder({
1929       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1930       G_FCOPYSIGN,
1931 
1932       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1933       G_ATOMICRMW_NAND,
1934       G_ATOMICRMW_FSUB,
1935       G_READ_REGISTER,
1936       G_WRITE_REGISTER,
1937 
1938       G_SADDO, G_SSUBO,
1939 
1940        // TODO: Implement
1941       G_FMINIMUM, G_FMAXIMUM}).lower();
1942 
1943   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1944       .lower();
1945 
1946   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1947         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1948         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1949     .unsupported();
1950 
1951   getLegacyLegalizerInfo().computeTables();
1952   verify(*ST.getInstrInfo());
1953 }
1954 
1955 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1956                                          MachineInstr &MI) const {
1957   MachineIRBuilder &B = Helper.MIRBuilder;
1958   MachineRegisterInfo &MRI = *B.getMRI();
1959 
1960   switch (MI.getOpcode()) {
1961   case TargetOpcode::G_ADDRSPACE_CAST:
1962     return legalizeAddrSpaceCast(MI, MRI, B);
1963   case TargetOpcode::G_FRINT:
1964     return legalizeFrint(MI, MRI, B);
1965   case TargetOpcode::G_FCEIL:
1966     return legalizeFceil(MI, MRI, B);
1967   case TargetOpcode::G_FREM:
1968     return legalizeFrem(MI, MRI, B);
1969   case TargetOpcode::G_INTRINSIC_TRUNC:
1970     return legalizeIntrinsicTrunc(MI, MRI, B);
1971   case TargetOpcode::G_SITOFP:
1972     return legalizeITOFP(MI, MRI, B, true);
1973   case TargetOpcode::G_UITOFP:
1974     return legalizeITOFP(MI, MRI, B, false);
1975   case TargetOpcode::G_FPTOSI:
1976     return legalizeFPTOI(MI, MRI, B, true);
1977   case TargetOpcode::G_FPTOUI:
1978     return legalizeFPTOI(MI, MRI, B, false);
1979   case TargetOpcode::G_FMINNUM:
1980   case TargetOpcode::G_FMAXNUM:
1981   case TargetOpcode::G_FMINNUM_IEEE:
1982   case TargetOpcode::G_FMAXNUM_IEEE:
1983     return legalizeMinNumMaxNum(Helper, MI);
1984   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1985     return legalizeExtractVectorElt(MI, MRI, B);
1986   case TargetOpcode::G_INSERT_VECTOR_ELT:
1987     return legalizeInsertVectorElt(MI, MRI, B);
1988   case TargetOpcode::G_FSIN:
1989   case TargetOpcode::G_FCOS:
1990     return legalizeSinCos(MI, MRI, B);
1991   case TargetOpcode::G_GLOBAL_VALUE:
1992     return legalizeGlobalValue(MI, MRI, B);
1993   case TargetOpcode::G_LOAD:
1994   case TargetOpcode::G_SEXTLOAD:
1995   case TargetOpcode::G_ZEXTLOAD:
1996     return legalizeLoad(Helper, MI);
1997   case TargetOpcode::G_STORE:
1998     return legalizeStore(Helper, MI);
1999   case TargetOpcode::G_FMAD:
2000     return legalizeFMad(MI, MRI, B);
2001   case TargetOpcode::G_FDIV:
2002     return legalizeFDIV(MI, MRI, B);
2003   case TargetOpcode::G_FFREXP:
2004     return legalizeFFREXP(MI, MRI, B);
2005   case TargetOpcode::G_FSQRT:
2006     return legalizeFSQRT(MI, MRI, B);
2007   case TargetOpcode::G_UDIV:
2008   case TargetOpcode::G_UREM:
2009   case TargetOpcode::G_UDIVREM:
2010     return legalizeUnsignedDIV_REM(MI, MRI, B);
2011   case TargetOpcode::G_SDIV:
2012   case TargetOpcode::G_SREM:
2013   case TargetOpcode::G_SDIVREM:
2014     return legalizeSignedDIV_REM(MI, MRI, B);
2015   case TargetOpcode::G_ATOMIC_CMPXCHG:
2016     return legalizeAtomicCmpXChg(MI, MRI, B);
2017   case TargetOpcode::G_FLOG2:
2018     return legalizeFlog2(MI, B);
2019   case TargetOpcode::G_FLOG:
2020   case TargetOpcode::G_FLOG10:
2021     return legalizeFlogCommon(MI, B);
2022   case TargetOpcode::G_FEXP2:
2023     return legalizeFExp2(MI, B);
2024   case TargetOpcode::G_FEXP:
2025     return legalizeFExp(MI, B);
2026   case TargetOpcode::G_FPOW:
2027     return legalizeFPow(MI, B);
2028   case TargetOpcode::G_FFLOOR:
2029     return legalizeFFloor(MI, MRI, B);
2030   case TargetOpcode::G_BUILD_VECTOR:
2031   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2032     return legalizeBuildVector(MI, MRI, B);
2033   case TargetOpcode::G_MUL:
2034     return legalizeMul(Helper, MI);
2035   case TargetOpcode::G_CTLZ:
2036   case TargetOpcode::G_CTTZ:
2037     return legalizeCTLZ_CTTZ(MI, MRI, B);
2038   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2039     return legalizeFPTruncRound(MI, B);
2040   default:
2041     return false;
2042   }
2043 
2044   llvm_unreachable("expected switch to return");
2045 }
2046 
2047 Register AMDGPULegalizerInfo::getSegmentAperture(
2048   unsigned AS,
2049   MachineRegisterInfo &MRI,
2050   MachineIRBuilder &B) const {
2051   MachineFunction &MF = B.getMF();
2052   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2053   const LLT S32 = LLT::scalar(32);
2054   const LLT S64 = LLT::scalar(64);
2055 
2056   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
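  // The segment "aperture" is the high 32 bits of the 64-bit flat address
  // range that the LDS or scratch segment is mapped into; legalizeAddrSpaceCast
  // below pairs it with a 32-bit segment offset to form a flat pointer.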
2057 
2058   if (ST.hasApertureRegs()) {
2059     // Note: this register is somewhat broken. When used as a 32-bit operand,
2060     // it only returns zeroes. The real value is in the upper 32 bits.
2061     // Thus, we must extract the high 32 bits.
2062     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2063                                        ? AMDGPU::SRC_SHARED_BASE
2064                                        : AMDGPU::SRC_PRIVATE_BASE;
2065     // FIXME: It would be more natural to emit a COPY here, but then copy
2066     // coalescing would kick in and it would think it's okay to use the "HI"
2067     // subregister (instead of extracting the HI 32 bits) which is an artificial
2068     // (unusable) register.
2069     //  Register TableGen definitions would need an overhaul to get rid of the
2070     //  artificial "HI" aperture registers and prevent this kind of issue from
2071     //  happening.
2072     Register Dst = MRI.createGenericVirtualRegister(S64);
2073     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2074     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2075     return B.buildUnmerge(S32, Dst).getReg(1);
2076   }
2077 
2078   // TODO: can we be smarter about machine pointer info?
2079   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2080   Register LoadAddr = MRI.createGenericVirtualRegister(
2081     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2082   // For code object version 5, private_base and shared_base are passed through
2083   // implicit kernargs.
2084   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
2085       AMDGPU::AMDHSA_COV5) {
2086     AMDGPUTargetLowering::ImplicitParameter Param =
2087         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2088                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2089     uint64_t Offset =
2090         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2091 
2092     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2093         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2094 
2095     if (!loadInputValue(KernargPtrReg, B,
2096                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2097       return Register();
2098 
2099     MachineMemOperand *MMO = MF.getMachineMemOperand(
2100         PtrInfo,
2101         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2102             MachineMemOperand::MOInvariant,
2103         LLT::scalar(32), commonAlignment(Align(64), Offset));
2104 
2105     // Compute the address of the implicit parameter.
2106     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2107                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2108     // Load the aperture value from it.
2109     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2110   }
2111 
2112   Register QueuePtr = MRI.createGenericVirtualRegister(
2113     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2114 
2115   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2116     return Register();
2117 
2118   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2119   // private_segment_aperture_base_hi.
2120   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2121 
2122   MachineMemOperand *MMO = MF.getMachineMemOperand(
2123       PtrInfo,
2124       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2125           MachineMemOperand::MOInvariant,
2126       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2127 
2128   B.buildPtrAdd(LoadAddr, QueuePtr,
2129                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2130   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2131 }
2132 
2133 /// Return true if the value is a known valid address, such that a null check is
2134 /// not necessary.
2135 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2136                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2137   MachineInstr *Def = MRI.getVRegDef(Val);
2138   switch (Def->getOpcode()) {
2139   case AMDGPU::G_FRAME_INDEX:
2140   case AMDGPU::G_GLOBAL_VALUE:
2141   case AMDGPU::G_BLOCK_ADDR:
2142     return true;
2143   case AMDGPU::G_CONSTANT: {
2144     const ConstantInt *CI = Def->getOperand(1).getCImm();
2145     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2146   }
2147   default:
2148     return false;
2149   }
2150 
2151   return false;
2152 }
2153 
2154 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2155   MachineInstr &MI, MachineRegisterInfo &MRI,
2156   MachineIRBuilder &B) const {
2157   MachineFunction &MF = B.getMF();
2158 
2159   const LLT S32 = LLT::scalar(32);
2160   Register Dst = MI.getOperand(0).getReg();
2161   Register Src = MI.getOperand(1).getReg();
2162 
2163   LLT DstTy = MRI.getType(Dst);
2164   LLT SrcTy = MRI.getType(Src);
2165   unsigned DestAS = DstTy.getAddressSpace();
2166   unsigned SrcAS = SrcTy.getAddressSpace();
2167 
2168   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2169   // vector element.
2170   assert(!DstTy.isVector());
2171 
2172   const AMDGPUTargetMachine &TM
2173     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2174 
2175   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2176     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2177     return true;
2178   }
2179 
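  // Flat -> LDS/private: the segment pointer is just the low 32 bits of the
  // flat pointer, except that a null flat pointer must map to the segment's
  // null value (which need not be zero). The reverse direction below rebuilds
  // a flat pointer by pairing the 32-bit segment offset with the segment
  // aperture, again special-casing null.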
2180   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2181       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2182        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2183     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2184       // Extract low 32-bits of the pointer.
2185       B.buildExtract(Dst, Src, 0);
2186       MI.eraseFromParent();
2187       return true;
2188     }
2189 
2190     unsigned NullVal = TM.getNullPointerValue(DestAS);
2191 
2192     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2193     auto FlatNull = B.buildConstant(SrcTy, 0);
2194 
2195     // Extract low 32-bits of the pointer.
2196     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2197 
2198     auto CmpRes =
2199         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2200     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2201 
2202     MI.eraseFromParent();
2203     return true;
2204   }
2205 
2206   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2207       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2208        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2209     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2210     if (!ApertureReg.isValid())
2211       return false;
2212 
2213     // Coerce the type of the low half of the result so we can use merge_values.
2214     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2215 
2216     // TODO: Should we allow mismatched types but matching sizes in merges to
2217     // avoid the ptrtoint?
2218     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2219 
2220     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2221       B.buildCopy(Dst, BuildPtr);
2222       MI.eraseFromParent();
2223       return true;
2224     }
2225 
2226     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2227     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2228 
2229     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2230                               SegmentNull.getReg(0));
2231 
2232     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2233 
2234     MI.eraseFromParent();
2235     return true;
2236   }
2237 
2238   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2239       SrcTy.getSizeInBits() == 64) {
2240     // Truncate.
2241     B.buildExtract(Dst, Src, 0);
2242     MI.eraseFromParent();
2243     return true;
2244   }
2245 
2246   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2247       DstTy.getSizeInBits() == 64) {
2248     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2249     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2250     auto PtrLo = B.buildPtrToInt(S32, Src);
2251     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2252     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2253     MI.eraseFromParent();
2254     return true;
2255   }
2256 
2257   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2258       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2259 
2260   LLVMContext &Ctx = MF.getFunction().getContext();
2261   Ctx.diagnose(InvalidAddrSpaceCast);
2262   B.buildUndef(Dst);
2263   MI.eraseFromParent();
2264   return true;
2265 }
2266 
2267 bool AMDGPULegalizerInfo::legalizeFrint(
2268   MachineInstr &MI, MachineRegisterInfo &MRI,
2269   MachineIRBuilder &B) const {
2270   Register Src = MI.getOperand(1).getReg();
2271   LLT Ty = MRI.getType(Src);
2272   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2273 
2274   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2275   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
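  // This is the classic add/subtract-2^52 trick: for |Src| < 2^52, adding
  // copysign(2^52, Src) shifts every fractional bit out of the f64 mantissa
  // (rounding in the current rounding mode, as rint requires), and subtracting
  // the same constant back recovers the rounded value. Values with |Src| > C2
  // are at least 2^52 in magnitude and therefore already integral, so the
  // final select returns Src unchanged for them.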
2276 
2277   auto C1 = B.buildFConstant(Ty, C1Val);
2278   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2279 
2280   // TODO: Should this propagate fast-math-flags?
2281   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2282   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2283 
2284   auto C2 = B.buildFConstant(Ty, C2Val);
2285   auto Fabs = B.buildFAbs(Ty, Src);
2286 
2287   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2288   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2289   MI.eraseFromParent();
2290   return true;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeFceil(
2294   MachineInstr &MI, MachineRegisterInfo &MRI,
2295   MachineIRBuilder &B) const {
2296 
2297   const LLT S1 = LLT::scalar(1);
2298   const LLT S64 = LLT::scalar(64);
2299 
2300   Register Src = MI.getOperand(1).getReg();
2301   assert(MRI.getType(Src) == S64);
2302 
2303   // result = trunc(src)
2304   // if (src > 0.0 && src != result)
2305   //   result += 1.0
2306 
2307   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2308 
2309   const auto Zero = B.buildFConstant(S64, 0.0);
2310   const auto One = B.buildFConstant(S64, 1.0);
2311   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2312   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2313   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2314   auto Add = B.buildSelect(S64, And, One, Zero);
2315 
2316   // TODO: Should this propagate fast-math-flags?
2317   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2318   MI.eraseFromParent();
2319   return true;
2320 }
2321 
2322 bool AMDGPULegalizerInfo::legalizeFrem(
2323   MachineInstr &MI, MachineRegisterInfo &MRI,
2324   MachineIRBuilder &B) const {
2325     Register DstReg = MI.getOperand(0).getReg();
2326     Register Src0Reg = MI.getOperand(1).getReg();
2327     Register Src1Reg = MI.getOperand(2).getReg();
2328     auto Flags = MI.getFlags();
2329     LLT Ty = MRI.getType(DstReg);
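    // frem(x, y) is expanded as x - trunc(x / y) * y, expressed below as a
    // single fma(-trunc(x / y), y, x).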
2330 
2331     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2332     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2333     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2334     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2335     MI.eraseFromParent();
2336     return true;
2337 }
2338 
2339 static MachineInstrBuilder extractF64Exponent(Register Hi,
2340                                               MachineIRBuilder &B) {
2341   const unsigned FractBits = 52;
2342   const unsigned ExpBits = 11;
2343   LLT S32 = LLT::scalar(32);
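  // The 11-bit exponent field of an IEEE-754 double occupies bits [62:52],
  // i.e. bits [30:20] of the high 32-bit word. Extract it with ubfe and
  // subtract the bias (1023) to get the unbiased exponent.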
2344 
2345   auto Const0 = B.buildConstant(S32, FractBits - 32);
2346   auto Const1 = B.buildConstant(S32, ExpBits);
2347 
2348   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2349     .addUse(Hi)
2350     .addUse(Const0.getReg(0))
2351     .addUse(Const1.getReg(0));
2352 
2353   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2354 }
2355 
2356 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2357   MachineInstr &MI, MachineRegisterInfo &MRI,
2358   MachineIRBuilder &B) const {
2359   const LLT S1 = LLT::scalar(1);
2360   const LLT S32 = LLT::scalar(32);
2361   const LLT S64 = LLT::scalar(64);
2362 
2363   Register Src = MI.getOperand(1).getReg();
2364   assert(MRI.getType(Src) == S64);
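  // Bit-twiddling implementation of trunc for f64: with unbiased exponent Exp
  // in [0, 51], the low (52 - Exp) fraction bits are fractional, so
  // FractMask >> Exp selects exactly those bits and ANDing with the complement
  // clears them. Exp < 0 means |Src| < 1.0 and the result is a signed zero;
  // Exp > 51 means Src is already an integer and is returned unchanged.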
2365 
2366   // TODO: Should this use extract since the low half is unused?
2367   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2368   Register Hi = Unmerge.getReg(1);
2369 
2370   // Extract the upper half, since this is where we will find the sign and
2371   // exponent.
2372   auto Exp = extractF64Exponent(Hi, B);
2373 
2374   const unsigned FractBits = 52;
2375 
2376   // Extract the sign bit.
2377   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2378   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2379 
2380   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2381 
2382   const auto Zero32 = B.buildConstant(S32, 0);
2383 
2384   // Extend back to 64-bits.
2385   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2386 
2387   auto Shr = B.buildAShr(S64, FractMask, Exp);
2388   auto Not = B.buildNot(S64, Shr);
2389   auto Tmp0 = B.buildAnd(S64, Src, Not);
2390   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2391 
2392   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2393   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2394 
2395   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2396   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2397   MI.eraseFromParent();
2398   return true;
2399 }
2400 
2401 bool AMDGPULegalizerInfo::legalizeITOFP(
2402   MachineInstr &MI, MachineRegisterInfo &MRI,
2403   MachineIRBuilder &B, bool Signed) const {
2404 
2405   Register Dst = MI.getOperand(0).getReg();
2406   Register Src = MI.getOperand(1).getReg();
2407 
2408   const LLT S64 = LLT::scalar(64);
2409   const LLT S32 = LLT::scalar(32);
2410 
2411   assert(MRI.getType(Src) == S64);
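  // The 64-bit source is converted piecewise. For an f64 result:
  //   result = itofp(hi) * 2^32 + uitofp(lo)
  // (the low word is always treated as unsigned). For an f32 result, the value
  // is first shifted left so its significant bits land in the high word, any
  // nonzero bits remaining in the low word are ORed into the low bit of the
  // high word as a sticky bit for correct rounding, and the converted value is
  // rescaled with ldexp.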
2412 
2413   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2414   auto ThirtyTwo = B.buildConstant(S32, 32);
2415 
2416   if (MRI.getType(Dst) == S64) {
2417     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2418                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2419 
2420     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2421     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2422 
2423     // TODO: Should this propagate fast-math-flags?
2424     B.buildFAdd(Dst, LdExp, CvtLo);
2425     MI.eraseFromParent();
2426     return true;
2427   }
2428 
2429   assert(MRI.getType(Dst) == S32);
2430 
2431   auto One = B.buildConstant(S32, 1);
2432 
2433   MachineInstrBuilder ShAmt;
2434   if (Signed) {
2435     auto ThirtyOne = B.buildConstant(S32, 31);
2436     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2437     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2438     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2439     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2440                                /*HasSideEffects=*/false)
2441                   .addUse(Unmerge.getReg(1));
2442     auto LS2 = B.buildSub(S32, LS, One);
2443     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2444   } else
2445     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2446   auto Norm = B.buildShl(S64, Src, ShAmt);
2447   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2448   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2449   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2450   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2451   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2452   B.buildFLdexp(Dst, FVal, Scale);
2453   MI.eraseFromParent();
2454   return true;
2455 }
2456 
2457 // TODO: Copied from DAG implementation. Verify logic and document how this
2458 // actually works.
2459 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2460                                         MachineRegisterInfo &MRI,
2461                                         MachineIRBuilder &B,
2462                                         bool Signed) const {
2463 
2464   Register Dst = MI.getOperand(0).getReg();
2465   Register Src = MI.getOperand(1).getReg();
2466 
2467   const LLT S64 = LLT::scalar(64);
2468   const LLT S32 = LLT::scalar(32);
2469 
2470   const LLT SrcLT = MRI.getType(Src);
2471   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2472 
2473   unsigned Flags = MI.getFlags();
2474 
2475   // The basic idea of converting a floating point number into a pair of 32-bit
2476   // integers is illustrated as follows:
2477   //
2478   //     tf := trunc(val);
2479   //    hif := floor(tf * 2^-32);
2480   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2481   //     hi := fptoi(hif);
2482   //     lo := fptoi(lof);
2483   //
2484   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2485   MachineInstrBuilder Sign;
2486   if (Signed && SrcLT == S32) {
2487     // However, a 32-bit floating point number has only a 23-bit mantissa,
2488     // which is not enough to hold all the significant bits of `lof` if val is
2489     // negative. To avoid the loss of precision, we need to take the absolute
2490     // value after truncating and flip the result back based on the original
2491     // signedness.
2492     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2493     Trunc = B.buildFAbs(S32, Trunc, Flags);
2494   }
2495   MachineInstrBuilder K0, K1;
2496   if (SrcLT == S64) {
2497     K0 = B.buildFConstant(
2498         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2499     K1 = B.buildFConstant(
2500         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2501   } else {
2502     K0 = B.buildFConstant(
2503         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2504     K1 = B.buildFConstant(
2505         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2506   }
2507 
2508   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2509   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2510   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2511 
2512   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2513                                      : B.buildFPTOUI(S32, FloorMul);
2514   auto Lo = B.buildFPTOUI(S32, Fma);
2515 
2516   if (Signed && SrcLT == S32) {
2517     // Flip the result based on the signedness, which is either all 0s or 1s.
2518     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2519     // r := xor({lo, hi}, sign) - sign;
2520     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2521                Sign);
2522   } else
2523     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2524   MI.eraseFromParent();
2525 
2526   return true;
2527 }
2528 
2529 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2530                                                MachineInstr &MI) const {
2531   MachineFunction &MF = Helper.MIRBuilder.getMF();
2532   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2533 
2534   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2535                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2536 
2537   // With ieee_mode disabled, the instructions already have the correct
2538   // behavior for G_FMINNUM/G_FMAXNUM.
2539   if (!MFI->getMode().IEEE)
2540     return !IsIEEEOp;
2541 
2542   if (IsIEEEOp)
2543     return true;
2544 
2545   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2546 }
2547 
2548 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2549   MachineInstr &MI, MachineRegisterInfo &MRI,
2550   MachineIRBuilder &B) const {
2551   // TODO: Should move some of this into LegalizerHelper.
2552 
2553   // TODO: Promote dynamic indexing of s16 to s32
2554 
2555   Register Dst = MI.getOperand(0).getReg();
2556   Register Vec = MI.getOperand(1).getReg();
2557 
2558   LLT VecTy = MRI.getType(Vec);
2559   LLT EltTy = VecTy.getElementType();
2560   assert(EltTy == MRI.getType(Dst));
2561 
2562   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2563   // but we can't go directly to that logic because you can't bitcast a vector
2564   // of pointers to a vector of integers. Therefore, introduce an intermediate
2565   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2566   // drive the legalization forward.
2567   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2568     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2569     LLT IntVecTy = VecTy.changeElementType(IntTy);
2570 
2571     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2572     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2573     B.buildIntToPtr(Dst, IntElt);
2574 
2575     MI.eraseFromParent();
2576     return true;
2577   }
2578 
2579   // FIXME: Artifact combiner probably should have replaced the truncated
2580   // constant before this, so we shouldn't need
2581   // getIConstantVRegValWithLookThrough.
2582   std::optional<ValueAndVReg> MaybeIdxVal =
2583       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2584   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2585     return true;
2586   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2587 
2588   if (IdxVal < VecTy.getNumElements()) {
2589     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2590     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2591   } else {
2592     B.buildUndef(Dst);
2593   }
2594 
2595   MI.eraseFromParent();
2596   return true;
2597 }
2598 
2599 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2600   MachineInstr &MI, MachineRegisterInfo &MRI,
2601   MachineIRBuilder &B) const {
2602   // TODO: Should move some of this into LegalizerHelper.
2603 
2604   // TODO: Promote dynamic indexing of s16 to s32
2605 
2606   Register Dst = MI.getOperand(0).getReg();
2607   Register Vec = MI.getOperand(1).getReg();
2608   Register Ins = MI.getOperand(2).getReg();
2609 
2610   LLT VecTy = MRI.getType(Vec);
2611   LLT EltTy = VecTy.getElementType();
2612   assert(EltTy == MRI.getType(Ins));
2613 
2614   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2615   // but we can't go directly to that logic because you can't bitcast a vector
2616   // of pointers to a vector of integers. Therefore, make the pointer vector
2617   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2618   // new value, and then inttoptr the result vector back. This will then allow
2619   // the rest of legalization to take over.
2620   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2621     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2622     LLT IntVecTy = VecTy.changeElementType(IntTy);
2623 
2624     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2625     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2626     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2627                                                  MI.getOperand(3));
2628     B.buildIntToPtr(Dst, IntVecDest);
2629     MI.eraseFromParent();
2630     return true;
2631   }
2632 
2633   // FIXME: Artifact combiner probably should have replaced the truncated
2634   // constant before this, so we shouldn't need
2635   // getIConstantVRegValWithLookThrough.
2636   std::optional<ValueAndVReg> MaybeIdxVal =
2637       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2638   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2639     return true;
2640 
2641   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2642 
2643   unsigned NumElts = VecTy.getNumElements();
2644   if (IdxVal < NumElts) {
2645     SmallVector<Register, 8> SrcRegs;
2646     for (unsigned i = 0; i < NumElts; ++i)
2647       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2648     B.buildUnmerge(SrcRegs, Vec);
2649 
2650     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2651     B.buildMergeLikeInstr(Dst, SrcRegs);
2652   } else {
2653     B.buildUndef(Dst);
2654   }
2655 
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 bool AMDGPULegalizerInfo::legalizeSinCos(
2661   MachineInstr &MI, MachineRegisterInfo &MRI,
2662   MachineIRBuilder &B) const {
2663 
2664   Register DstReg = MI.getOperand(0).getReg();
2665   Register SrcReg = MI.getOperand(1).getReg();
2666   LLT Ty = MRI.getType(DstReg);
2667   unsigned Flags = MI.getFlags();
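  // The hardware sin/cos intrinsics take their input scaled by 1/(2*pi), i.e.
  // in units of full turns. On subtargets with a reduced valid input range the
  // scaled value is first wrapped into [0, 1) with amdgcn.fract.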
2668 
2669   Register TrigVal;
2670   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2671   if (ST.hasTrigReducedRange()) {
2672     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2673     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2674       .addUse(MulVal.getReg(0))
2675       .setMIFlags(Flags).getReg(0);
2676   } else
2677     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2678 
2679   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2680     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2681   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2682       .addUse(TrigVal)
2683       .setMIFlags(Flags);
2684   MI.eraseFromParent();
2685   return true;
2686 }
2687 
2688 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2689                                                   MachineIRBuilder &B,
2690                                                   const GlobalValue *GV,
2691                                                   int64_t Offset,
2692                                                   unsigned GAFlags) const {
2693   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2694   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2695   // to the following code sequence:
2696   //
2697   // For constant address space:
2698   //   s_getpc_b64 s[0:1]
2699   //   s_add_u32 s0, s0, $symbol
2700   //   s_addc_u32 s1, s1, 0
2701   //
2702   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2703   //   a fixup or relocation is emitted to replace $symbol with a literal
2704   //   constant, which is a pc-relative offset from the encoding of the $symbol
2705   //   operand to the global variable.
2706   //
2707   // For global address space:
2708   //   s_getpc_b64 s[0:1]
2709   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2710   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2711   //
2712   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2713   //   fixups or relocations are emitted to replace $symbol@*@lo and
2714   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2715   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2716   //   operand to the global variable.
2717   //
2718   // What we want here is an offset from the value returned by s_getpc
2719   // (which is the address of the s_add_u32 instruction) to the global
2720   // variable, but since the encoding of $symbol starts 4 bytes after the start
2721   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2722   // small. This requires us to add 4 to the global variable offset in order to
2723   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2724   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2725   // instruction.
2726 
2727   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2728 
2729   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2730     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2731 
2732   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2733     .addDef(PCReg);
2734 
2735   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2736   if (GAFlags == SIInstrInfo::MO_NONE)
2737     MIB.addImm(0);
2738   else
2739     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2740 
2741   if (!B.getMRI()->getRegClassOrNull(PCReg))
2742     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2743 
2744   if (PtrTy.getSizeInBits() == 32)
2745     B.buildExtract(DstReg, PCReg, 0);
2746   return true;
2747 }
2748 
2749 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2750   MachineInstr &MI, MachineRegisterInfo &MRI,
2751   MachineIRBuilder &B) const {
2752   Register DstReg = MI.getOperand(0).getReg();
2753   LLT Ty = MRI.getType(DstReg);
2754   unsigned AS = Ty.getAddressSpace();
2755 
2756   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2757   MachineFunction &MF = B.getMF();
2758   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2759 
2760   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2761     if (!MFI->isModuleEntryFunction() &&
2762         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2763       const Function &Fn = MF.getFunction();
2764       DiagnosticInfoUnsupported BadLDSDecl(
2765         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2766         DS_Warning);
2767       Fn.getContext().diagnose(BadLDSDecl);
2768 
2769       // We currently don't have a way to correctly allocate LDS objects that
2770       // aren't directly associated with a kernel. We do force inlining of
2771       // functions that use local objects. However, if these dead functions are
2772       // not eliminated, we don't want a compile time error. Just emit a warning
2773       // and a trap, since there should be no callable path here.
2774       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2775       B.buildUndef(DstReg);
2776       MI.eraseFromParent();
2777       return true;
2778     }
2779 
2780     // TODO: We could emit code to handle the initialization somewhere.
2781     // We ignore the initializer for now and legalize it to allow selection.
2782     // The initializer will be rejected during assembly emission anyway.
2783     const SITargetLowering *TLI = ST.getTargetLowering();
2784     if (!TLI->shouldUseLDSConstAddress(GV)) {
2785       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2786       return true; // Leave in place;
2787     }
2788 
2789     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2790       Type *Ty = GV->getValueType();
2791       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2792       // zero-sized type in other languages to declare the dynamic shared
2793       // memory whose size is not known at compile time. They will be
2794       // allocated by the runtime and placed directly after the statically
2795       // allocated ones. They all share the same offset.
2796       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2797         // Adjust alignment for that dynamic shared memory array.
2798         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2799         LLT S32 = LLT::scalar(32);
2800         auto Sz =
2801             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2802         B.buildIntToPtr(DstReg, Sz);
2803         MI.eraseFromParent();
2804         return true;
2805       }
2806     }
2807 
2808     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2809                                                    *cast<GlobalVariable>(GV)));
2810     MI.eraseFromParent();
2811     return true;
2812   }
2813 
2814   const SITargetLowering *TLI = ST.getTargetLowering();
2815 
2816   if (TLI->shouldEmitFixup(GV)) {
2817     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2818     MI.eraseFromParent();
2819     return true;
2820   }
2821 
2822   if (TLI->shouldEmitPCReloc(GV)) {
2823     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2824     MI.eraseFromParent();
2825     return true;
2826   }
2827 
2828   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2829   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2830 
2831   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2832   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2833       MachinePointerInfo::getGOT(MF),
2834       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2835           MachineMemOperand::MOInvariant,
2836       LoadTy, Align(8));
2837 
2838   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2839 
2840   if (Ty.getSizeInBits() == 32) {
2841     // Truncate if this is a 32-bit constant address.
2842     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2843     B.buildExtract(DstReg, Load, 0);
2844   } else
2845     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2846 
2847   MI.eraseFromParent();
2848   return true;
2849 }
2850 
2851 static LLT widenToNextPowerOf2(LLT Ty) {
2852   if (Ty.isVector())
2853     return Ty.changeElementCount(
2854         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2855   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2856 }
2857 
2858 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2859                                        MachineInstr &MI) const {
2860   MachineIRBuilder &B = Helper.MIRBuilder;
2861   MachineRegisterInfo &MRI = *B.getMRI();
2862   GISelChangeObserver &Observer = Helper.Observer;
2863 
2864   Register PtrReg = MI.getOperand(1).getReg();
2865   LLT PtrTy = MRI.getType(PtrReg);
2866   unsigned AddrSpace = PtrTy.getAddressSpace();
2867 
2868   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2869     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2870     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2871     Observer.changingInstr(MI);
2872     MI.getOperand(1).setReg(Cast.getReg(0));
2873     Observer.changedInstr(MI);
2874     return true;
2875   }
2876 
2877   if (MI.getOpcode() != AMDGPU::G_LOAD)
2878     return false;
2879 
2880   Register ValReg = MI.getOperand(0).getReg();
2881   LLT ValTy = MRI.getType(ValReg);
2882 
2883   if (hasBufferRsrcWorkaround(ValTy)) {
2884     Observer.changingInstr(MI);
2885     castBufferRsrcFromV4I32(MI, B, MRI, 0);
2886     Observer.changedInstr(MI);
2887     return true;
2888   }
2889 
2890   MachineMemOperand *MMO = *MI.memoperands_begin();
2891   const unsigned ValSize = ValTy.getSizeInBits();
2892   const LLT MemTy = MMO->getMemoryType();
2893   const Align MemAlign = MMO->getAlign();
2894   const unsigned MemSize = MemTy.getSizeInBits();
2895   const uint64_t AlignInBits = 8 * MemAlign.value();
2896 
2897   // Widen non-power-of-2 loads to the alignment if needed
2898   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2899     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2900 
2901     // This was already the correct extending load result type, so just adjust
2902     // the memory type.
2903     if (WideMemSize == ValSize) {
2904       MachineFunction &MF = B.getMF();
2905 
2906       MachineMemOperand *WideMMO =
2907           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2908       Observer.changingInstr(MI);
2909       MI.setMemRefs(MF, {WideMMO});
2910       Observer.changedInstr(MI);
2911       return true;
2912     }
2913 
2914     // Don't bother handling edge case that should probably never be produced.
2915     if (ValSize > WideMemSize)
2916       return false;
2917 
2918     LLT WideTy = widenToNextPowerOf2(ValTy);
2919 
2920     Register WideLoad;
2921     if (!WideTy.isVector()) {
2922       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2923       B.buildTrunc(ValReg, WideLoad);
2924     } else {
2925       // Extract the subvector.
2926 
2927       if (isRegisterType(ValTy)) {
2928         // If this is a case where G_EXTRACT is legal, use it.
2929         // (e.g. <3 x s32> -> <4 x s32>)
2930         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2931         B.buildExtract(ValReg, WideLoad, 0);
2932       } else {
2933         // For cases where the widened type isn't a nice register value, unmerge
2934         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2935         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2936         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2937       }
2938     }
2939 
2940     MI.eraseFromParent();
2941     return true;
2942   }
2943 
2944   return false;
2945 }
2946 
2947 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
2948                                         MachineInstr &MI) const {
2949   MachineIRBuilder &B = Helper.MIRBuilder;
2950   MachineRegisterInfo &MRI = *B.getMRI();
2951   GISelChangeObserver &Observer = Helper.Observer;
2952 
2953   Register DataReg = MI.getOperand(0).getReg();
2954   LLT DataTy = MRI.getType(DataReg);
2955 
2956   if (hasBufferRsrcWorkaround(DataTy)) {
2957     Observer.changingInstr(MI);
2958     castBufferRsrcArgToV4I32(MI, B, 0);
2959     Observer.changedInstr(MI);
2960     return true;
2961   }
2962   return false;
2963 }
2964 
2965 bool AMDGPULegalizerInfo::legalizeFMad(
2966   MachineInstr &MI, MachineRegisterInfo &MRI,
2967   MachineIRBuilder &B) const {
2968   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2969   assert(Ty.isScalar());
2970 
2971   MachineFunction &MF = B.getMF();
2972   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2973 
2974   // TODO: Always legal with future ftz flag.
2975   // FIXME: Do we need just output?
2976   if (Ty == LLT::scalar(32) &&
2977       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
2978     return true;
2979   if (Ty == LLT::scalar(16) &&
2980       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
2981     return true;
2982 
2983   MachineIRBuilder HelperBuilder(MI);
2984   GISelObserverWrapper DummyObserver;
2985   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2986   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2987 }
2988 
2989 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2990   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2991   Register DstReg = MI.getOperand(0).getReg();
2992   Register PtrReg = MI.getOperand(1).getReg();
2993   Register CmpVal = MI.getOperand(2).getReg();
2994   Register NewVal = MI.getOperand(3).getReg();
2995 
2996   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2997          "this should not have been custom lowered");
2998 
2999   LLT ValTy = MRI.getType(CmpVal);
3000   LLT VecTy = LLT::fixed_vector(2, ValTy);
3001 
3002   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3003 
3004   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3005     .addDef(DstReg)
3006     .addUse(PtrReg)
3007     .addUse(PackedVal)
3008     .setMemRefs(MI.memoperands());
3009 
3010   MI.eraseFromParent();
3011   return true;
3012 }
3013 
3014 /// Return true if it's known that \p Src can never be an f32 denormal value.
3015 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3016                                        Register Src) {
3017   Register ExtSrc;
3018   if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
3019     return MRI.getType(ExtSrc) == LLT::scalar(16);
3020   return false;
3021 }
3022 
3023 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3024   if (Flags & MachineInstr::FmAfn)
3025     return true;
3026   const auto &Options = MF.getTarget().Options;
3027   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3028 }
3029 
3030 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3031                                    unsigned Flags) {
3032   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3033          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3034              DenormalMode::PreserveSign;
3035 }
3036 
3037 std::pair<Register, Register>
3038 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3039                                        unsigned Flags) const {
3040   if (allowApproxFunc(B.getMF(), Flags) ||
3041       !needsDenormHandlingF32(B.getMF(), Src, Flags))
3042     return {};
3043 
3044   const LLT F32 = LLT::scalar(32);
3045   auto SmallestNormal = B.buildFConstant(
3046       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3047   auto IsLtSmallestNormal =
3048       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3049 
3050   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3051   auto One = B.buildFConstant(F32, 1.0);
3052   auto ScaleFactor =
3053       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3054   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3055 
3056   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3057 }
3058 
3059 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3060                                         MachineIRBuilder &B) const {
3061   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3062   // If we have to handle denormals, scale up the input and adjust the result.
3063 
3064   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3065   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
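       // Illustrative check: for x = 0x1.0p-130 (an f32 denormal), the scaled
       // input is 0x1.0p-98, amdgpu_log2 of that is -98.0, and subtracting
       // 32.0 recovers log2(x) = -130.0.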
3066 
3067   Register Dst = MI.getOperand(0).getReg();
3068   Register Src = MI.getOperand(1).getReg();
3069   LLT Ty = B.getMRI()->getType(Dst);
3070   unsigned Flags = MI.getFlags();
3071 
3072   if (Ty == LLT::scalar(16)) {
3073     const LLT F32 = LLT::scalar(32);
3074     // Nothing in half is a denormal when promoted to f32.
3075     auto Ext = B.buildFPExt(F32, Src, Flags);
3076     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
3077       .addUse(Ext.getReg(0))
3078       .setMIFlags(Flags);
3079     B.buildFPTrunc(Dst, Log2, Flags);
3080     MI.eraseFromParent();
3081     return true;
3082   }
3083 
3084   assert(Ty == LLT::scalar(32));
3085 
3086   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3087   if (!ScaledInput) {
3088     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false)
3089         .addUse(Src)
3090         .setMIFlags(Flags);
3091     MI.eraseFromParent();
3092     return true;
3093   }
3094 
3095   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3096                   .addUse(ScaledInput)
3097                   .setMIFlags(Flags);
3098 
3099   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3100   auto Zero = B.buildFConstant(Ty, 0.0);
3101   auto ResultOffset =
3102       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3103   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3104 
3105   MI.eraseFromParent();
3106   return true;
3107 }
3108 
3109 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3110                        Register Z, unsigned Flags) {
3111   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3112   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3113 }
3114 
3115 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3116                                              MachineIRBuilder &B) const {
3117   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3118   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3119 
3120   MachineRegisterInfo &MRI = *B.getMRI();
3121   Register Dst = MI.getOperand(0).getReg();
3122   Register X = MI.getOperand(1).getReg();
3123   unsigned Flags = MI.getFlags();
3124   const LLT Ty = MRI.getType(X);
3125   MachineFunction &MF = B.getMF();
3126 
3127   const LLT F32 = LLT::scalar(32);
3128   const LLT F16 = LLT::scalar(16);
3129 
3130   const AMDGPUTargetMachine &TM =
3131       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3132 
3133   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3134       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3135     const double Log2BaseInv =
3136         IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
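         // I.e. log10(x) = log2(x) * (ln(2)/ln(10)) ~= log2(x) * 0.30103, and
         // ln(x) = log2(x) * ln(2) ~= log2(x) * 0.693147.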
3137 
3138     if (Ty == F16 && !ST.has16BitInsts()) {
3139       Register LogVal = MRI.createGenericVirtualRegister(F32);
3140       auto PromoteSrc = B.buildFPExt(F32, X);
3141       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), Log2BaseInv, Flags);
3142       B.buildFPTrunc(Dst, LogVal);
3143     } else {
3144       legalizeFlogUnsafe(B, Dst, X, Log2BaseInv, Flags);
3145     }
3146 
3147     MI.eraseFromParent();
3148     return true;
3149   }
3150 
3151   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3152   if (ScaledInput)
3153     X = ScaledInput;
3154 
3155   auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3156     .addUse(X)
3157     .setMIFlags(Flags);
3158 
3159   Register R;
3160   if (ST.hasFastFMAF32()) {
3161     // c + cc is ln(2)/ln(10) to more than 49 bits
3162     const float c_log10 = 0x1.344134p-2f;
3163     const float cc_log10 = 0x1.09f79ep-26f;
3164 
3165     // c + cc is ln(2) to more than 49 bits
3166     const float c_log = 0x1.62e42ep-1f;
3167     const float cc_log = 0x1.efa39ep-25f;
3168 
3169     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3170     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
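         // Extended-precision product R ~= Y * (c + cc): the first FMA
         // recovers the rounding error of Y * c, the second folds in Y * cc,
         // and the final add applies both corrections.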
3171 
3172     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3173     auto NegR = B.buildFNeg(Ty, R, Flags);
3174     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3175     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3176     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3177   } else {
3178     // ch+ct is ln(2)/ln(10) to more than 36 bits
3179     const float ch_log10 = 0x1.344000p-2f;
3180     const float ct_log10 = 0x1.3509f6p-18f;
3181 
3182     // ch + ct is ln(2) to more than 36 bits
3183     const float ch_log = 0x1.62e000p-1f;
3184     const float ct_log = 0x1.0bfbe8p-15f;
3185 
3186     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3187     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3188 
3189     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3190     auto YH = B.buildAnd(Ty, Y, MaskConst);
3191     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3192     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3193 
3194     Register Mad0 =
3195         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3196     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3197     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3198   }
3199 
3200   const bool IsFiniteOnly =
3201       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3202       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3203 
3204   if (!IsFiniteOnly) {
3205     // Expand isfinite(x) => fabs(x) < inf
3206     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3207     auto Fabs = B.buildFAbs(Ty, Y);
3208     auto IsFinite =
3209         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3210     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3211   }
3212 
3213   if (ScaledInput) {
3214     auto Zero = B.buildFConstant(Ty, 0.0);
3215     auto ShiftK =
3216         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3217     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3218     B.buildFSub(Dst, R, Shift, Flags);
3219   } else {
3220     B.buildCopy(Dst, R);
3221   }
3222 
3223   MI.eraseFromParent();
3224   return true;
3225 }
3226 
3227 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3228                                              Register Src,
3229                                              double Log2BaseInverted,
3230                                              unsigned Flags) const {
3231   LLT Ty = B.getMRI()->getType(Dst);
3232   auto Log2Operand = Ty == LLT::scalar(16)
3233                          ? B.buildFLog2(Ty, Src, Flags)
3234                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3235                                .addUse(Src)
3236                                .setMIFlags(Flags);
3237   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3238   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3239   return true;
3240 }
3241 
3242 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3243                                         MachineIRBuilder &B) const {
3244   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3245   // If we have to handle denormals, scale up the input and adjust the result.
3246 
3247   Register Dst = MI.getOperand(0).getReg();
3248   Register Src = MI.getOperand(1).getReg();
3249   unsigned Flags = MI.getFlags();
3250   LLT Ty = B.getMRI()->getType(Dst);
3251   const LLT F16 = LLT::scalar(16);
3252   const LLT F32 = LLT::scalar(32);
3253 
3254   if (Ty == F16) {
3255     // Nothing in half is a denormal when promoted to f32.
3256     auto Ext = B.buildFPExt(F32, Src, Flags);
3257     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false)
3258       .addUse(Ext.getReg(0))
3259       .setMIFlags(Flags);
3260     B.buildFPTrunc(Dst, Log2, Flags);
3261     MI.eraseFromParent();
3262     return true;
3263   }
3264 
3265   assert(Ty == F32);
3266 
3267   if (allowApproxFunc(B.getMF(), Flags) ||
3268       !needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3269     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
3270       .addUse(Src)
3271       .setMIFlags(Flags);
3272     MI.eraseFromParent();
3273     return true;
3274   }
3275 
3276   // bool needs_scaling = x < -0x1.f80000p+6f;
3277   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
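       // Illustrative example: for x = -130.0f (a result in the denormal
       // range), this evaluates exp2(-130 + 64) * 0x1.0p-64f
       // = 2^-66 * 2^-64 = 2^-130 rather than calling exp2 on -130 directly.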
3278 
3279   // -nextafter(128.0, -1)
3280   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3281   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3282                                   RangeCheckConst, Flags);
3283 
3284   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3285   auto Zero = B.buildFConstant(Ty, 0.0);
3286   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3287   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3288 
3289   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
3290                   .addUse(AddInput.getReg(0))
3291                   .setMIFlags(Flags);
3292 
3293   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3294   auto One = B.buildFConstant(Ty, 1.0);
3295   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3296   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3297   MI.eraseFromParent();
3298   return true;
3299 }
3300 
3301 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3302                                              Register Src,
3303                                              unsigned Flags) const {
3304   LLT Ty = B.getMRI()->getType(Dst);
3305   auto K = B.buildFConstant(Ty, numbers::log2e);
3306   auto Mul = B.buildFMul(Ty, Src, K, Flags);
3307 
3308   if (Ty == LLT::scalar(32)) {
3309     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
3310       .addUse(Mul.getReg(0))
3311       .setMIFlags(Flags);
3312   } else {
3313     B.buildFExp2(Dst, Mul.getReg(0), Flags);
3314   }
3315 
3316   return true;
3317 }
3318 
3319 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3320                                        MachineIRBuilder &B) const {
3321   Register Dst = MI.getOperand(0).getReg();
3322   Register X = MI.getOperand(1).getReg();
3323   const unsigned Flags = MI.getFlags();
3324   MachineFunction &MF = B.getMF();
3325   MachineRegisterInfo &MRI = *B.getMRI();
3326   LLT Ty = MRI.getType(Dst);
3327   const LLT F16 = LLT::scalar(16);
3328   const LLT F32 = LLT::scalar(32);
3329   const bool IsExp10 = false; // TODO: For some reason exp10 is missing
3330 
3331   if (Ty == F16) {
3332     // v_exp_f16 (fmul x, log2e)
3333     if (allowApproxFunc(MF, Flags)) {
3334       // TODO: Does this really require fast?
3335       legalizeFExpUnsafe(B, Dst, X, Flags);
3336       MI.eraseFromParent();
3337       return true;
3338     }
3339 
3340     // exp(f16 x) ->
3341     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3342 
3343     // Nothing in half is a denormal when promoted to f32.
3344     auto Ext = B.buildFPExt(F32, X, Flags);
3345     Register Lowered = MRI.createGenericVirtualRegister(F32);
3346     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3347     B.buildFPTrunc(Dst, Lowered, Flags);
3348     MI.eraseFromParent();
3349     return true;
3350   }
3351 
3352   assert(Ty == F32);
3353 
3354   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3355   // library behavior. Also, is known-not-daz source sufficient?
3356   if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
3357     legalizeFExpUnsafe(B, Dst, X, Flags);
3358     MI.eraseFromParent();
3359     return true;
3360   }
3361 
3362   //    Algorithm:
3363   //
3364   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3365   //
3366   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3367   //    n = 64*m + j,   0 <= j < 64
3368   //
3369   //    e^x = 2^((64*m + j + f)/64)
3370   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3371   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3372   //
3373   //    f = x*(64/ln(2)) - n
3374   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3375   //
3376   //    e^x = (2^m) * (2^(j/64)) * e^r
3377   //
3378   //    (2^(j/64)) is precomputed
3379   //
3380   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3381   //    e^r = 1 + q
3382   //
3383   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3384   //
3385   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
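       //
       //    Rough numeric check of the PH/E/ldexp structure below (ignoring
       //    the low-order PL term): for x = 10, PH ~= x * log2(e) ~= 14.427,
       //    E = rint(PH) = 14, and ldexp(exp2(14.427 - 14), 14)
       //    ~= 1.344 * 16384 ~= 22026 ~= e^10.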
3386   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3387   Register PH, PL;
3388 
3389   if (ST.hasFastFMAF32()) {
3390     const float c_exp = numbers::log2ef;
3391     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3392     const float c_exp10 = 0x1.a934f0p+1f;
3393     const float cc_exp10 = 0x1.2f346ep-24f;
3394 
3395     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3396     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3397     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3398     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3399 
3400     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3401     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3402   } else {
3403     const float ch_exp = 0x1.714000p+0f;
3404     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3405 
3406     const float ch_exp10 = 0x1.a92000p+1f;
3407     const float cl_exp10 = 0x1.4f0978p-11f;
3408 
3409     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3410     auto XH = B.buildAnd(Ty, X, MaskConst);
3411     auto XL = B.buildFSub(Ty, X, XH, Flags);
3412 
3413     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3414     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3415 
3416     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3417     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3418 
3419     Register Mad0 =
3420         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3421     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3422   }
3423 
3424   auto E = B.buildFRint(Ty, PH, Flags);
3425 
3426   // It is unsafe to contract this fsub into the PH multiply.
3427   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3428   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3429   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3430 
3431   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
3432                   .addUse(A.getReg(0))
3433                   .setMIFlags(Flags);
3434   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3435 
3436   auto UnderflowCheckConst =
3437       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3438   auto Zero = B.buildFConstant(Ty, 0.0);
3439   auto Underflow =
3440       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3441 
3442   R = B.buildSelect(Ty, Underflow, Zero, R);
3443 
3444   const auto &Options = MF.getTarget().Options;
3445 
3446   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3447     auto OverflowCheckConst =
3448         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3449 
3450     auto Overflow =
3451         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3452     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3453     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3454   }
3455 
3456   B.buildCopy(Dst, R);
3457   MI.eraseFromParent();
3458   return true;
3459 }
3460 
3461 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3462                                        MachineIRBuilder &B) const {
3463   Register Dst = MI.getOperand(0).getReg();
3464   Register Src0 = MI.getOperand(1).getReg();
3465   Register Src1 = MI.getOperand(2).getReg();
3466   unsigned Flags = MI.getFlags();
3467   LLT Ty = B.getMRI()->getType(Dst);
3468   const LLT S16 = LLT::scalar(16);
3469   const LLT S32 = LLT::scalar(32);
3470 
3471   if (Ty == S32) {
3472     auto Log = B.buildFLog2(S32, Src0, Flags);
3473     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
3474       .addUse(Log.getReg(0))
3475       .addUse(Src1)
3476       .setMIFlags(Flags);
3477     B.buildFExp2(Dst, Mul, Flags);
3478   } else if (Ty == S16) {
3479     // There's no f16 fmul_legacy, so we need to convert for it.
3480     auto Log = B.buildFLog2(S16, Src0, Flags);
3481     auto Ext0 = B.buildFPExt(S32, Log, Flags);
3482     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
3483     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
3484       .addUse(Ext0.getReg(0))
3485       .addUse(Ext1.getReg(0))
3486       .setMIFlags(Flags);
3487 
3488     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
3489   } else
3490     return false;
3491 
3492   MI.eraseFromParent();
3493   return true;
3494 }
3495 
3496 // Find a source register, ignoring any possible source modifiers.
3497 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3498   Register ModSrc = OrigSrc;
3499   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3500     ModSrc = SrcFNeg->getOperand(1).getReg();
3501     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3502       ModSrc = SrcFAbs->getOperand(1).getReg();
3503   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3504     ModSrc = SrcFAbs->getOperand(1).getReg();
3505   return ModSrc;
3506 }
3507 
3508 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3509                                          MachineRegisterInfo &MRI,
3510                                          MachineIRBuilder &B) const {
3511 
3512   const LLT S1 = LLT::scalar(1);
3513   const LLT S64 = LLT::scalar(64);
3514   Register Dst = MI.getOperand(0).getReg();
3515   Register OrigSrc = MI.getOperand(1).getReg();
3516   unsigned Flags = MI.getFlags();
3517   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
3518          "this should not have been custom lowered");
3519 
3520   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3521   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3522   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3523   // V_FRACT bug is:
3524   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3525   //
3526   // Convert floor(x) to (x - fract(x))
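       //
       // E.g. floor(-0.5) = -0.5 - fract(-0.5) = -0.5 - 0.5 = -1.0; the min()
       // clamp below only changes the result when the buggy V_FRACT returns
       // 1.0.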
3527 
3528   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
3529     .addUse(OrigSrc)
3530     .setMIFlags(Flags);
3531 
3532   // Give source modifier matching some assistance before obscuring a foldable
3533   // pattern.
3534 
3535   // TODO: We can avoid the neg on the fract? The input sign to fract
3536   // shouldn't matter?
3537   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3538 
3539   auto Const =
3540       B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
3541 
3542   Register Min = MRI.createGenericVirtualRegister(S64);
3543 
3544   // We don't need to concern ourselves with the snan handling difference, so
3545   // use the one which will directly select.
3546   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3547   if (MFI->getMode().IEEE)
3548     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3549   else
3550     B.buildFMinNum(Min, Fract, Const, Flags);
3551 
3552   Register CorrectedFract = Min;
3553   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3554     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3555     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
3556   }
3557 
3558   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
3559   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3560 
3561   MI.eraseFromParent();
3562   return true;
3563 }
3564 
3565 // Turn an illegal packed v2s16 build vector into bit operations.
3566 // TODO: This should probably be a bitcast action in LegalizerHelper.
3567 bool AMDGPULegalizerInfo::legalizeBuildVector(
3568   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3569   Register Dst = MI.getOperand(0).getReg();
3570   const LLT S32 = LLT::scalar(32);
3571   const LLT S16 = LLT::scalar(16);
3572   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3573 
3574   Register Src0 = MI.getOperand(1).getReg();
3575   Register Src1 = MI.getOperand(2).getReg();
3576 
3577   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3578     assert(MRI.getType(Src0) == S32);
3579     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3580     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3581   }
3582 
3583   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3584   B.buildBitcast(Dst, Merge);
3585 
3586   MI.eraseFromParent();
3587   return true;
3588 }
3589 
3590 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3591 //
3592 // Source and accumulation registers must all be 32 bits wide.
3593 //
3594 // TODO: When the multiply is uniform, we should produce a code sequence
3595 // that is better suited to instruction selection on the SALU. Instead of
3596 // the outer loop going over parts of the result, the outer loop should go
3597 // over parts of one of the factors. This should result in instruction
3598 // selection that makes full use of S_ADDC_U32 instructions.
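     //
     // Illustrative reference (not the exact sequence emitted): with 32-bit
     // limbs, the exact product satisfies
     //   Dst[k] = sum over i + j == k of Src0[i] * Src1[j], plus carries from
     //   position k - 1,
     // so e.g. for a 64 x 64 -> 64 bit multiply:
     //   Dst[0] = lo32(Src0[0] * Src1[0])
     //   Dst[1] = hi32(Src0[0] * Src1[0]) + lo32(Src0[0] * Src1[1])
     //          + lo32(Src0[1] * Src1[0])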
3599 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3600                                         MutableArrayRef<Register> Accum,
3601                                         ArrayRef<Register> Src0,
3602                                         ArrayRef<Register> Src1,
3603                                         bool UsePartialMad64_32,
3604                                         bool SeparateOddAlignedProducts) const {
3605   // Use (possibly empty) vectors of S1 registers to represent the set of
3606   // carries from one pair of positions to the next.
3607   using Carry = SmallVector<Register, 2>;
3608 
3609   MachineIRBuilder &B = Helper.MIRBuilder;
3610   GISelKnownBits &KB = *Helper.getKnownBits();
3611 
3612   const LLT S1 = LLT::scalar(1);
3613   const LLT S32 = LLT::scalar(32);
3614   const LLT S64 = LLT::scalar(64);
3615 
3616   Register Zero32;
3617   Register Zero64;
3618 
3619   auto getZero32 = [&]() -> Register {
3620     if (!Zero32)
3621       Zero32 = B.buildConstant(S32, 0).getReg(0);
3622     return Zero32;
3623   };
3624   auto getZero64 = [&]() -> Register {
3625     if (!Zero64)
3626       Zero64 = B.buildConstant(S64, 0).getReg(0);
3627     return Zero64;
3628   };
3629 
3630   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3631   for (unsigned i = 0; i < Src0.size(); ++i) {
3632     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3633     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3634   }
3635 
3636   // Merge the given carries into the 32-bit LocalAccum, which is modified
3637   // in-place.
3638   //
3639   // Returns the carry-out, which is a single S1 register or null.
3640   auto mergeCarry =
3641       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3642         if (CarryIn.empty())
3643           return Register();
3644 
3645         bool HaveCarryOut = true;
3646         Register CarryAccum;
3647         if (CarryIn.size() == 1) {
3648           if (!LocalAccum) {
3649             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3650             return Register();
3651           }
3652 
3653           CarryAccum = getZero32();
3654         } else {
3655           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3656           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3657             CarryAccum =
3658                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3659                     .getReg(0);
3660           }
3661 
3662           if (!LocalAccum) {
3663             LocalAccum = getZero32();
3664             HaveCarryOut = false;
3665           }
3666         }
3667 
3668         auto Add =
3669             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3670         LocalAccum = Add.getReg(0);
3671         return HaveCarryOut ? Add.getReg(1) : Register();
3672       };
3673 
3674   // Build a multiply-add chain to compute
3675   //
3676   //   LocalAccum + (partial products at DstIndex)
3677   //       + (opportunistic subset of CarryIn)
3678   //
3679   // LocalAccum is an array of one or two 32-bit registers that are updated
3680   // in-place. The incoming registers may be null.
3681   //
3682   // In some edge cases, carry-ins can be consumed "for free". In that case,
3683   // the consumed carry bits are removed from CarryIn in-place.
3684   auto buildMadChain =
3685       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3686           -> Carry {
3687         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3688                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3689 
3690         Carry CarryOut;
3691         unsigned j0 = 0;
3692 
3693         // Use plain 32-bit multiplication for the most significant part of the
3694         // result by default.
3695         if (LocalAccum.size() == 1 &&
3696             (!UsePartialMad64_32 || !CarryIn.empty())) {
3697           do {
3698             // Skip multiplication if one of the operands is 0
3699             unsigned j1 = DstIndex - j0;
3700             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3701               ++j0;
3702               continue;
3703             }
3704             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3705             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3706               LocalAccum[0] = Mul.getReg(0);
3707             } else {
3708               if (CarryIn.empty()) {
3709                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3710               } else {
3711                 LocalAccum[0] =
3712                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3713                         .getReg(0);
3714                 CarryIn.pop_back();
3715               }
3716             }
3717             ++j0;
3718           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3719         }
3720 
3721         // Build full 64-bit multiplies.
3722         if (j0 <= DstIndex) {
3723           bool HaveSmallAccum = false;
3724           Register Tmp;
3725 
3726           if (LocalAccum[0]) {
3727             if (LocalAccum.size() == 1) {
3728               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3729               HaveSmallAccum = true;
3730             } else if (LocalAccum[1]) {
3731               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3732               HaveSmallAccum = false;
3733             } else {
3734               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3735               HaveSmallAccum = true;
3736             }
3737           } else {
3738             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3739             Tmp = getZero64();
3740             HaveSmallAccum = true;
3741           }
3742 
3743           do {
3744             unsigned j1 = DstIndex - j0;
3745             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3746               ++j0;
3747               continue;
3748             }
3749             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3750                                     {Src0[j0], Src1[j1], Tmp});
3751             Tmp = Mad.getReg(0);
3752             if (!HaveSmallAccum)
3753               CarryOut.push_back(Mad.getReg(1));
3754             HaveSmallAccum = false;
3755 
3756             ++j0;
3757           } while (j0 <= DstIndex);
3758 
3759           auto Unmerge = B.buildUnmerge(S32, Tmp);
3760           LocalAccum[0] = Unmerge.getReg(0);
3761           if (LocalAccum.size() > 1)
3762             LocalAccum[1] = Unmerge.getReg(1);
3763         }
3764 
3765         return CarryOut;
3766       };
3767 
3768   // Outer multiply loop, iterating over destination parts from least
3769   // significant to most significant parts.
3770   //
3771   // The columns of the following diagram correspond to the destination parts
3772   // affected by one iteration of the outer loop (ignoring boundary
3773   // conditions).
3774   //
3775   //   Dest index relative to 2 * i:      1 0 -1
3776   //                                      ------
3777   //   Carries from previous iteration:     e o
3778   //   Even-aligned partial product sum:  E E .
3779   //   Odd-aligned partial product sum:     O O
3780   //
3781   // 'o' is OddCarry, 'e' is EvenCarry.
3782   // EE and OO are computed from partial products via buildMadChain and use
3783   // accumulation where possible and appropriate.
3784   //
3785   Register SeparateOddCarry;
3786   Carry EvenCarry;
3787   Carry OddCarry;
3788 
3789   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3790     Carry OddCarryIn = std::move(OddCarry);
3791     Carry EvenCarryIn = std::move(EvenCarry);
3792     OddCarry.clear();
3793     EvenCarry.clear();
3794 
3795     // Partial products at offset 2 * i.
3796     if (2 * i < Accum.size()) {
3797       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3798       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3799     }
3800 
3801     // Partial products at offset 2 * i - 1.
3802     if (i > 0) {
3803       if (!SeparateOddAlignedProducts) {
3804         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3805         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3806       } else {
3807         bool IsHighest = 2 * i >= Accum.size();
3808         Register SeparateOddOut[2];
3809         auto LocalAccum = MutableArrayRef(SeparateOddOut)
3810                               .take_front(IsHighest ? 1 : 2);
3811         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3812 
3813         MachineInstr *Lo;
3814 
3815         if (i == 1) {
3816           if (!IsHighest)
3817             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3818           else
3819             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3820         } else {
3821           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3822                             SeparateOddCarry);
3823         }
3824         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3825 
3826         if (!IsHighest) {
3827           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3828                                 Lo->getOperand(1).getReg());
3829           Accum[2 * i] = Hi.getReg(0);
3830           SeparateOddCarry = Hi.getReg(1);
3831         }
3832       }
3833     }
3834 
3835     // Add in the carries from the previous iteration
3836     if (i > 0) {
3837       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3838         EvenCarryIn.push_back(CarryOut);
3839 
3840       if (2 * i < Accum.size()) {
3841         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3842           OddCarry.push_back(CarryOut);
3843       }
3844     }
3845   }
3846 }
3847 
3848 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3849 //
3850 // TODO: If the multiply is followed by an addition, we should attempt to
3851 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3852 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3853                                       MachineInstr &MI) const {
3854   assert(ST.hasMad64_32());
3855   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3856 
3857   MachineIRBuilder &B = Helper.MIRBuilder;
3858   MachineRegisterInfo &MRI = *B.getMRI();
3859 
3860   Register DstReg = MI.getOperand(0).getReg();
3861   Register Src0 = MI.getOperand(1).getReg();
3862   Register Src1 = MI.getOperand(2).getReg();
3863 
3864   LLT Ty = MRI.getType(DstReg);
3865   assert(Ty.isScalar());
3866 
3867   unsigned Size = Ty.getSizeInBits();
3868   unsigned NumParts = Size / 32;
3869   assert((Size % 32) == 0);
3870   assert(NumParts >= 2);
3871 
3872   // Whether to use MAD_64_32 for partial products whose high half is
3873   // discarded. This avoids some ADD instructions but risks false dependency
3874   // stalls on some subtargets in some cases.
3875   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3876 
3877   // Whether to compute odd-aligned partial products separately. This is
3878   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3879   // in an even-aligned VGPR.
3880   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3881 
3882   LLT S32 = LLT::scalar(32);
3883   SmallVector<Register, 2> Src0Parts, Src1Parts;
3884   for (unsigned i = 0; i < NumParts; ++i) {
3885     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3886     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3887   }
3888   B.buildUnmerge(Src0Parts, Src0);
3889   B.buildUnmerge(Src1Parts, Src1);
3890 
3891   SmallVector<Register, 2> AccumRegs(NumParts);
3892   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3893                 SeparateOddAlignedProducts);
3894 
3895   B.buildMergeLikeInstr(DstReg, AccumRegs);
3896   MI.eraseFromParent();
3897   return true;
3898 }
3899 
3900 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3901 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3902 // case with a single min instruction instead of a compare+select.
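     // E.g. for a zero s32 source, the FFBH/FFBL pseudo produces ~0u, and
     // umin(~0u, 32) yields the expected ctlz/cttz result of 32.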
3903 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3904                                             MachineRegisterInfo &MRI,
3905                                             MachineIRBuilder &B) const {
3906   Register Dst = MI.getOperand(0).getReg();
3907   Register Src = MI.getOperand(1).getReg();
3908   LLT DstTy = MRI.getType(Dst);
3909   LLT SrcTy = MRI.getType(Src);
3910 
3911   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3912                         ? AMDGPU::G_AMDGPU_FFBH_U32
3913                         : AMDGPU::G_AMDGPU_FFBL_B32;
3914   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3915   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3916 
3917   MI.eraseFromParent();
3918   return true;
3919 }
3920 
3921 // Check that this is a G_XOR x, -1
3922 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3923   if (MI.getOpcode() != TargetOpcode::G_XOR)
3924     return false;
3925   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3926   return ConstVal && *ConstVal == -1;
3927 }
3928 
3929 // Return the branch instruction that uses the condition, or null if invalid.
3930 static MachineInstr *
3931 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3932                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3933   Register CondDef = MI.getOperand(0).getReg();
3934   if (!MRI.hasOneNonDBGUse(CondDef))
3935     return nullptr;
3936 
3937   MachineBasicBlock *Parent = MI.getParent();
3938   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3939 
3940   if (isNot(MRI, *UseMI)) {
3941     Register NegatedCond = UseMI->getOperand(0).getReg();
3942     if (!MRI.hasOneNonDBGUse(NegatedCond))
3943       return nullptr;
3944 
3945     // We're deleting the def of this value, so we need to remove it.
3946     eraseInstr(*UseMI, MRI);
3947 
3948     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3949     Negated = true;
3950   }
3951 
3952   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3953     return nullptr;
3954 
3955   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3956   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3957   if (Next == Parent->end()) {
3958     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3959     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3960       return nullptr;
3961     UncondBrTarget = &*NextMBB;
3962   } else {
3963     if (Next->getOpcode() != AMDGPU::G_BR)
3964       return nullptr;
3965     Br = &*Next;
3966     UncondBrTarget = Br->getOperand(0).getMBB();
3967   }
3968 
3969   return UseMI;
3970 }
3971 
3972 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3973                                          const ArgDescriptor *Arg,
3974                                          const TargetRegisterClass *ArgRC,
3975                                          LLT ArgTy) const {
3976   MCRegister SrcReg = Arg->getRegister();
3977   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3978   assert(DstReg.isVirtual() && "Virtual register expected");
3979 
3980   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3981                                              *ArgRC, B.getDebugLoc(), ArgTy);
3982   if (Arg->isMasked()) {
3983     // TODO: Should we try to emit this once in the entry block?
3984     const LLT S32 = LLT::scalar(32);
3985     const unsigned Mask = Arg->getMask();
3986     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
3987 
3988     Register AndMaskSrc = LiveIn;
3989 
3990     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3991     // 0.
3992     if (Shift != 0) {
3993       auto ShiftAmt = B.buildConstant(S32, Shift);
3994       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3995     }
3996 
3997     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3998   } else {
3999     B.buildCopy(DstReg, LiveIn);
4000   }
4001 
4002   return true;
4003 }
4004 
4005 bool AMDGPULegalizerInfo::loadInputValue(
4006     Register DstReg, MachineIRBuilder &B,
4007     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4008   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4009   const ArgDescriptor *Arg;
4010   const TargetRegisterClass *ArgRC;
4011   LLT ArgTy;
4012   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4013 
4014   if (!Arg) {
4015     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4016       // The intrinsic may appear when we have a 0-sized kernarg segment, in
4017       // which case the pointer argument may be missing and we use null.
4018       B.buildConstant(DstReg, 0);
4019       return true;
4020     }
4021 
4022     // It's undefined behavior if a function marked with the amdgpu-no-*
4023     // attributes uses the corresponding intrinsic.
4024     B.buildUndef(DstReg);
4025     return true;
4026   }
4027 
4028   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4029     return false; // TODO: Handle these
4030   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4031 }
4032 
4033 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4034     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4035     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4036   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4037     return false;
4038 
4039   MI.eraseFromParent();
4040   return true;
4041 }
4042 
4043 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4044                                 int64_t C) {
4045   B.buildConstant(MI.getOperand(0).getReg(), C);
4046   MI.eraseFromParent();
4047   return true;
4048 }
4049 
4050 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4051     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4052     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4053   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4054   if (MaxID == 0)
4055     return replaceWithConstant(B, MI, 0);
4056 
4057   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4058   const ArgDescriptor *Arg;
4059   const TargetRegisterClass *ArgRC;
4060   LLT ArgTy;
4061   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4062 
4063   Register DstReg = MI.getOperand(0).getReg();
4064   if (!Arg) {
4065     // It's undefined behavior if a function marked with the amdgpu-no-*
4066     // attributes uses the corresponding intrinsic.
4067     B.buildUndef(DstReg);
4068     MI.eraseFromParent();
4069     return true;
4070   }
4071 
4072   if (Arg->isMasked()) {
4073     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4074     // masking operations anyway.
4075     //
4076     // TODO: We could assert the top bit is 0 for the source copy.
4077     if (!loadInputValue(DstReg, B, ArgType))
4078       return false;
4079   } else {
4080     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4081     if (!loadInputValue(TmpReg, B, ArgType))
4082       return false;
4083     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4084   }
4085 
4086   MI.eraseFromParent();
4087   return true;
4088 }
4089 
4090 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4091                                                      int64_t Offset) const {
4092   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4093   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4094 
4095   // TODO: If we passed in the base kernel offset we could have a better
4096   // alignment than 4, but we don't really need it.
4097   if (!loadInputValue(KernArgReg, B,
4098                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4099     llvm_unreachable("failed to find kernarg segment ptr");
4100 
4101   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4102   // TODO: Should get nuw
4103   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4104 }
4105 
4106 /// Legalize a value that's loaded from kernel arguments. This is only used by
4107 /// legacy intrinsics.
4108 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4109                                                       MachineIRBuilder &B,
4110                                                       uint64_t Offset,
4111                                                       Align Alignment) const {
4112   Register DstReg = MI.getOperand(0).getReg();
4113 
4114   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4115          "unexpected kernarg parameter type");
4116 
4117   Register Ptr = getKernargParameterPtr(B, Offset);
4118   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4119   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4120               MachineMemOperand::MODereferenceable |
4121                   MachineMemOperand::MOInvariant);
4122   MI.eraseFromParent();
4123   return true;
4124 }
4125 
4126 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4127                                        MachineRegisterInfo &MRI,
4128                                        MachineIRBuilder &B) const {
4129   Register Dst = MI.getOperand(0).getReg();
4130   LLT DstTy = MRI.getType(Dst);
4131   LLT S16 = LLT::scalar(16);
4132   LLT S32 = LLT::scalar(32);
4133   LLT S64 = LLT::scalar(64);
4134 
4135   if (DstTy == S16)
4136     return legalizeFDIV16(MI, MRI, B);
4137   if (DstTy == S32)
4138     return legalizeFDIV32(MI, MRI, B);
4139   if (DstTy == S64)
4140     return legalizeFDIV64(MI, MRI, B);
4141 
4142   return false;
4143 }
4144 
4145 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4146                                                         Register DstDivReg,
4147                                                         Register DstRemReg,
4148                                                         Register X,
4149                                                         Register Y) const {
4150   const LLT S1 = LLT::scalar(1);
4151   const LLT S32 = LLT::scalar(32);
4152 
4153   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4154   // algorithm used here.
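       //
       // In short: z ~= 2^32 / y from the scaled float reciprocal, refined
       // once via z += umulh(z, z * -y); then q = umulh(x, z) and
       // r = x - q * y, followed by at most two conditional correction steps.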
4155 
4156   // Initial estimate of inv(y).
4157   auto FloatY = B.buildUITOFP(S32, Y);
4158   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4159   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4160   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4161   auto Z = B.buildFPTOUI(S32, ScaledY);
4162 
4163   // One round of UNR.
4164   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4165   auto NegYZ = B.buildMul(S32, NegY, Z);
4166   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4167 
4168   // Quotient/remainder estimate.
4169   auto Q = B.buildUMulH(S32, X, Z);
4170   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4171 
4172   // First quotient/remainder refinement.
4173   auto One = B.buildConstant(S32, 1);
4174   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4175   if (DstDivReg)
4176     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4177   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4178 
4179   // Second quotient/remainder refinement.
4180   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4181   if (DstDivReg)
4182     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4183 
4184   if (DstRemReg)
4185     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4186 }
4187 
4188 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4189 //
4190 // Return lo, hi of result
4191 //
4192 // %cvt.lo = G_UITOFP Val.lo
4193 // %cvt.hi = G_UITOFP Val.hi
4194 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4195 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4196 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4197 // %mul2 = G_FMUL %mul1, 2**(-32)
4198 // %trunc = G_INTRINSIC_TRUNC %mul2
4199 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4200 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
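     //
     // Roughly, this yields an initial estimate of 2^64 / Val split into
     // {low, high} 32-bit halves, which the 64-bit DIV/REM expansion below
     // refines with Newton-Raphson style steps.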
4201 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4202                                                        Register Val) {
4203   const LLT S32 = LLT::scalar(32);
4204   auto Unmerge = B.buildUnmerge(S32, Val);
4205 
4206   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4207   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4208 
4209   auto Mad = B.buildFMAD(
4210       S32, CvtHi, // 2**32
4211       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4212 
4213   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4214   auto Mul1 = B.buildFMul(
4215       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4216 
4217   // 2**(-32)
4218   auto Mul2 = B.buildFMul(
4219       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4220   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4221 
4222   // -(2**32)
4223   auto Mad2 = B.buildFMAD(
4224       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4225       Mul1);
4226 
4227   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4228   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4229 
4230   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4231 }
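
// Rough numeric view of the sequence above (a sketch, not exact hardware
// semantics): with v = Val taken as a real number,
//   mad   ~= v                              (hi * 2^32 + lo, as float)
//   mul1  ~= 2^64 / v                       (rcp scaled by 0x5f7ffffc)
//   trunc ~= floor(mul1 * 2^-32)            (upper 32 bits of the estimate)
//   mad2  ~= mul1 - trunc * 2^32            (lower 32 bits of the estimate)
// so {ResultLo, ResultHi} form a rough estimate of 2^64 / v that the 64-bit
// DIV/REM expansion below refines with Newton-Raphson style steps.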
4232 
4233 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4234                                                         Register DstDivReg,
4235                                                         Register DstRemReg,
4236                                                         Register Numer,
4237                                                         Register Denom) const {
4238   const LLT S32 = LLT::scalar(32);
4239   const LLT S64 = LLT::scalar(64);
4240   const LLT S1 = LLT::scalar(1);
4241   Register RcpLo, RcpHi;
4242 
4243   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4244 
4245   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4246 
4247   auto Zero64 = B.buildConstant(S64, 0);
4248   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4249 
4250   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4251   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4252 
4253   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4254   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4255   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4256 
4257   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4258   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4259   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4260 
4261   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4262   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4263   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4264   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4265   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4266 
4267   auto Zero32 = B.buildConstant(S32, 0);
4268   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4269   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4270   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4271 
4272   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4273   Register NumerLo = UnmergeNumer.getReg(0);
4274   Register NumerHi = UnmergeNumer.getReg(1);
4275 
4276   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4277   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4278   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4279   Register Mul3_Lo = UnmergeMul3.getReg(0);
4280   Register Mul3_Hi = UnmergeMul3.getReg(1);
4281   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4282   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4283   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4284   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4285 
4286   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4287   Register DenomLo = UnmergeDenom.getReg(0);
4288   Register DenomHi = UnmergeDenom.getReg(1);
4289 
4290   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4291   auto C1 = B.buildSExt(S32, CmpHi);
4292 
4293   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4294   auto C2 = B.buildSExt(S32, CmpLo);
4295 
4296   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4297   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4298 
4299   // TODO: Here and below, portions of the code could be enclosed in if/endif
4300   // blocks. Currently the control flow is unconditional, and we have 4 selects
4301   // after the potential endif to substitute for PHIs.
4302 
4303   // if C3 != 0 ...
4304   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4305   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4306   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4307   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4308 
4309   auto One64 = B.buildConstant(S64, 1);
4310   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4311 
4312   auto C4 =
4313       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4314   auto C5 =
4315       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4316   auto C6 = B.buildSelect(
4317       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4318 
4319   // if (C6 != 0)
4320   auto Add4 = B.buildAdd(S64, Add3, One64);
4321   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4322 
4323   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4324   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4325   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4326 
4327   // endif C6
4328   // endif C3
4329 
4330   if (DstDivReg) {
4331     auto Sel1 = B.buildSelect(
4332         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4333     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4334                   Sel1, MulHi3);
4335   }
4336 
4337   if (DstRemReg) {
4338     auto Sel2 = B.buildSelect(
4339         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4340     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4341                   Sel2, Sub1);
4342   }
4343 }
4344 
4345 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4346                                                   MachineRegisterInfo &MRI,
4347                                                   MachineIRBuilder &B) const {
4348   Register DstDivReg, DstRemReg;
4349   switch (MI.getOpcode()) {
4350   default:
4351     llvm_unreachable("Unexpected opcode!");
4352   case AMDGPU::G_UDIV: {
4353     DstDivReg = MI.getOperand(0).getReg();
4354     break;
4355   }
4356   case AMDGPU::G_UREM: {
4357     DstRemReg = MI.getOperand(0).getReg();
4358     break;
4359   }
4360   case AMDGPU::G_UDIVREM: {
4361     DstDivReg = MI.getOperand(0).getReg();
4362     DstRemReg = MI.getOperand(1).getReg();
4363     break;
4364   }
4365   }
4366 
4367   const LLT S64 = LLT::scalar(64);
4368   const LLT S32 = LLT::scalar(32);
4369   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4370   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4371   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4372   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4373 
4374   if (Ty == S32)
4375     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4376   else if (Ty == S64)
4377     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4378   else
4379     return false;
4380 
4381   MI.eraseFromParent();
4382   return true;
4383 }
4384 
4385 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4386                                                 MachineRegisterInfo &MRI,
4387                                                 MachineIRBuilder &B) const {
4388   const LLT S64 = LLT::scalar(64);
4389   const LLT S32 = LLT::scalar(32);
4390 
4391   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4392   if (Ty != S32 && Ty != S64)
4393     return false;
4394 
4395   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4396   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4397   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4398 
4399   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4400   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4401   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4402 
4403   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4404   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4405 
4406   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4407   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4408 
4409   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4410   switch (MI.getOpcode()) {
4411   default:
4412     llvm_unreachable("Unexpected opcode!");
4413   case AMDGPU::G_SDIV: {
4414     DstDivReg = MI.getOperand(0).getReg();
4415     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4416     break;
4417   }
4418   case AMDGPU::G_SREM: {
4419     DstRemReg = MI.getOperand(0).getReg();
4420     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4421     break;
4422   }
4423   case AMDGPU::G_SDIVREM: {
4424     DstDivReg = MI.getOperand(0).getReg();
4425     DstRemReg = MI.getOperand(1).getReg();
4426     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4427     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4428     break;
4429   }
4430   }
4431 
4432   if (Ty == S32)
4433     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4434   else
4435     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4436 
4437   if (DstDivReg) {
4438     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4439     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4440     B.buildSub(DstDivReg, SignXor, Sign);
4441   }
4442 
4443   if (DstRemReg) {
4444     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4445     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4446     B.buildSub(DstRemReg, SignXor, Sign);
4447   }
4448 
4449   MI.eraseFromParent();
4450   return true;
4451 }
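
// Quick sanity check of the sign handling above (illustrative numbers only):
// for s32 LHS = -7, RHS = 3 we get LHSign = -1, RHSign = 0, so
//   |LHS| = (-7 + -1) ^ -1 = 7 and |RHS| = 3.
// The unsigned expansion yields q = 2, r = 1, and the final fixups give
//   div = (2 ^ (-1 ^ 0)) - (-1) = -2 and rem = (1 ^ -1) - (-1) = -1,
// matching the C-style semantics where -7 / 3 == -2 and -7 % 3 == -1.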
4452 
4453 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4454                                                  MachineRegisterInfo &MRI,
4455                                                  MachineIRBuilder &B) const {
4456   Register Res = MI.getOperand(0).getReg();
4457   Register LHS = MI.getOperand(1).getReg();
4458   Register RHS = MI.getOperand(2).getReg();
4459   uint16_t Flags = MI.getFlags();
4460   LLT ResTy = MRI.getType(Res);
4461 
4462   const MachineFunction &MF = B.getMF();
4463   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4464                             MF.getTarget().Options.UnsafeFPMath;
4465 
4466   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4467     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4468       return false;
4469 
4470     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4471     // the CI documentation they have a worst case error of 1 ulp.
4472     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4473     // use them as long as we aren't trying to use denormals.
4474     //
4475     // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4476 
4477     // 1 / x -> RCP(x)
4478     if (CLHS->isExactlyValue(1.0)) {
4479       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
4480         .addUse(RHS)
4481         .setMIFlags(Flags);
4482 
4483       MI.eraseFromParent();
4484       return true;
4485     }
4486 
4487     // TODO: Match rsq
4488 
4489     // -1 / x -> RCP( FNEG(x) )
4490     if (CLHS->isExactlyValue(-1.0)) {
4491       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4492       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
4493         .addUse(FNeg.getReg(0))
4494         .setMIFlags(Flags);
4495 
4496       MI.eraseFromParent();
4497       return true;
4498     }
4499   }
4500 
4501   // For f16 require arcp only.
4502   // For f32 require afn+arcp.
4503   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4504                               !MI.getFlag(MachineInstr::FmArcp)))
4505     return false;
4506 
4507   // x / y -> x * (1.0 / y)
4508   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
4509     .addUse(RHS)
4510     .setMIFlags(Flags);
4511   B.buildFMul(Res, LHS, RCP, Flags);
4512 
4513   MI.eraseFromParent();
4514   return true;
4515 }
4516 
4517 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4518                                                    MachineRegisterInfo &MRI,
4519                                                    MachineIRBuilder &B) const {
4520   Register Res = MI.getOperand(0).getReg();
4521   Register X = MI.getOperand(1).getReg();
4522   Register Y = MI.getOperand(2).getReg();
4523   uint16_t Flags = MI.getFlags();
4524   LLT ResTy = MRI.getType(Res);
4525 
4526   const MachineFunction &MF = B.getMF();
4527   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4528                             MI.getFlag(MachineInstr::FmAfn);
4529 
4530   if (!AllowInaccurateRcp)
4531     return false;
4532 
4533   auto NegY = B.buildFNeg(ResTy, Y);
4534   auto One = B.buildFConstant(ResTy, 1.0);
4535 
4536   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
4537     .addUse(Y)
4538     .setMIFlags(Flags);
4539 
4540   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4541   R = B.buildFMA(ResTy, Tmp0, R, R);
4542 
4543   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4544   R = B.buildFMA(ResTy, Tmp1, R, R);
4545 
4546   auto Ret = B.buildFMul(ResTy, X, R);
4547   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4548 
4549   B.buildFMA(Res, Tmp2, R, Ret);
4550   MI.eraseFromParent();
4551   return true;
4552 }
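
// The sequence above is two Newton-Raphson refinements of r ~= 1/y followed by
// one residual correction of the product (sketch of the math only): with
// e = fma(-y, r, 1) the current error term,
//   r   <- fma(e, r, r)        // r * (1 + e), applied twice
//   ret  = x * r
//   res  = fma(fma(-y, ret, x), r, ret)
// which folds the remaining error x - y*ret back into the result.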
4553 
4554 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4555                                          MachineRegisterInfo &MRI,
4556                                          MachineIRBuilder &B) const {
4557   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4558     return true;
4559 
4560   Register Res = MI.getOperand(0).getReg();
4561   Register LHS = MI.getOperand(1).getReg();
4562   Register RHS = MI.getOperand(2).getReg();
4563 
4564   uint16_t Flags = MI.getFlags();
4565 
4566   LLT S16 = LLT::scalar(16);
4567   LLT S32 = LLT::scalar(32);
4568 
4569   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4570   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4571 
4572   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4573     .addUse(RHSExt.getReg(0))
4574     .setMIFlags(Flags);
4575 
4576   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4577   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4578 
4579   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4580     .addUse(RDst.getReg(0))
4581     .addUse(RHS)
4582     .addUse(LHS)
4583     .setMIFlags(Flags);
4584 
4585   MI.eraseFromParent();
4586   return true;
4587 }
4588 
4589 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4590 // to enable denorm mode; when false, restore the function's default FP32 mode.
4591 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4592                                const GCNSubtarget &ST,
4593                                SIModeRegisterDefaults Mode) {
4594   // Set SP denorm mode to this value.
4595   unsigned SPDenormMode =
4596     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4597 
4598   if (ST.hasDenormModeInst()) {
4599     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4600     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4601 
4602     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4603     B.buildInstr(AMDGPU::S_DENORM_MODE)
4604       .addImm(NewDenormModeValue);
4605 
4606   } else {
4607     // Select FP32 bit field in mode register.
4608     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
4609                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4610                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
4611 
4612     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4613       .addImm(SPDenormMode)
4614       .addImm(SPDenormModeBitField);
4615   }
4616 }
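
// For reference on the encodings used above (informational comment only): with
// offset 4 and WIDTH_M1 = 1, the S_SETREG form writes the 2-bit FP32 denorm
// field of the MODE register, i.e. hwreg(HW_REG_MODE, 4, 2), while
// S_DENORM_MODE takes a 4-bit immediate with the FP32 control in bits [1:0]
// and the FP64/FP16 control in bits [3:2] (hence 'DPDenormModeDefault << 2').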
4617 
4618 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4619                                          MachineRegisterInfo &MRI,
4620                                          MachineIRBuilder &B) const {
4621   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4622     return true;
4623 
4624   Register Res = MI.getOperand(0).getReg();
4625   Register LHS = MI.getOperand(1).getReg();
4626   Register RHS = MI.getOperand(2).getReg();
4627   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4628   SIModeRegisterDefaults Mode = MFI->getMode();
4629 
4630   uint16_t Flags = MI.getFlags();
4631 
4632   LLT S32 = LLT::scalar(32);
4633   LLT S1 = LLT::scalar(1);
4634 
4635   auto One = B.buildFConstant(S32, 1.0f);
4636 
4637   auto DenominatorScaled =
4638     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
4639       .addUse(LHS)
4640       .addUse(RHS)
4641       .addImm(0)
4642       .setMIFlags(Flags);
4643   auto NumeratorScaled =
4644     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
4645       .addUse(LHS)
4646       .addUse(RHS)
4647       .addImm(1)
4648       .setMIFlags(Flags);
4649 
4650   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4651     .addUse(DenominatorScaled.getReg(0))
4652     .setMIFlags(Flags);
4653   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4654 
4655   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
4656   // aren't modeled as reading it.
4657   if (Mode.FP32Denormals != DenormalMode::getIEEE())
4658     toggleSPDenormMode(true, B, ST, Mode);
4659 
4660   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4661   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4662   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4663   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4664   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4665   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4666 
4667   // FIXME: This mishandles dynamic denormal mode. We need to query the
4668   // current mode and restore the original.
4669   if (Mode.FP32Denormals != DenormalMode::getIEEE())
4670     toggleSPDenormMode(false, B, ST, Mode);
4671 
4672   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
4673     .addUse(Fma4.getReg(0))
4674     .addUse(Fma1.getReg(0))
4675     .addUse(Fma3.getReg(0))
4676     .addUse(NumeratorScaled.getReg(1))
4677     .setMIFlags(Flags);
4678 
4679   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4680     .addUse(Fmas.getReg(0))
4681     .addUse(RHS)
4682     .addUse(LHS)
4683     .setMIFlags(Flags);
4684 
4685   MI.eraseFromParent();
4686   return true;
4687 }
4688 
4689 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4690                                          MachineRegisterInfo &MRI,
4691                                          MachineIRBuilder &B) const {
4692   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4693     return true;
4694 
4695   Register Res = MI.getOperand(0).getReg();
4696   Register LHS = MI.getOperand(1).getReg();
4697   Register RHS = MI.getOperand(2).getReg();
4698 
4699   uint16_t Flags = MI.getFlags();
4700 
4701   LLT S64 = LLT::scalar(64);
4702   LLT S1 = LLT::scalar(1);
4703 
4704   auto One = B.buildFConstant(S64, 1.0);
4705 
4706   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4707     .addUse(LHS)
4708     .addUse(RHS)
4709     .addImm(0)
4710     .setMIFlags(Flags);
4711 
4712   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4713 
4714   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
4715     .addUse(DivScale0.getReg(0))
4716     .setMIFlags(Flags);
4717 
4718   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4719   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4720   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4721 
4722   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4723     .addUse(LHS)
4724     .addUse(RHS)
4725     .addImm(1)
4726     .setMIFlags(Flags);
4727 
4728   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4729   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4730   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4731 
4732   Register Scale;
4733   if (!ST.hasUsableDivScaleConditionOutput()) {
4734     // Workaround a hardware bug on SI where the condition output from div_scale
4735     // is not usable.
4736 
4737     LLT S32 = LLT::scalar(32);
4738 
4739     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4740     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4741     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4742     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4743 
4744     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4745                               Scale1Unmerge.getReg(1));
4746     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4747                               Scale0Unmerge.getReg(1));
4748     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4749   } else {
4750     Scale = DivScale1.getReg(1);
4751   }
4752 
4753   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4754     .addUse(Fma4.getReg(0))
4755     .addUse(Fma3.getReg(0))
4756     .addUse(Mul.getReg(0))
4757     .addUse(Scale)
4758     .setMIFlags(Flags);
4759 
4760   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4761       .addUse(Fmas.getReg(0))
4762       .addUse(RHS)
4763       .addUse(LHS)
4764       .setMIFlags(Flags);
4765 
4766   MI.eraseFromParent();
4767   return true;
4768 }
4769 
4770 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
4771                                          MachineRegisterInfo &MRI,
4772                                          MachineIRBuilder &B) const {
4773   Register Res0 = MI.getOperand(0).getReg();
4774   Register Res1 = MI.getOperand(1).getReg();
4775   Register Val = MI.getOperand(2).getReg();
4776   uint16_t Flags = MI.getFlags();
4777 
4778   LLT Ty = MRI.getType(Res0);
4779   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
4780 
4781   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false)
4782                   .addUse(Val)
4783                   .setMIFlags(Flags);
4784   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false)
4785                  .addUse(Val)
4786                  .setMIFlags(Flags);
4787 
4788   if (ST.hasFractBug()) {
4789     auto Fabs = B.buildFAbs(Ty, Val);
4790     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
4791     auto IsFinite =
4792         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
4793     auto Zero = B.buildConstant(InstrExpTy, 0);
4794     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
4795     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
4796   }
4797 
4798   B.buildCopy(Res0, Mant);
4799   B.buildSExtOrTrunc(Res1, Exp);
4800 
4801   MI.eraseFromParent();
4802   return true;
4803 }
4804 
4805 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4806                                                  MachineRegisterInfo &MRI,
4807                                                  MachineIRBuilder &B) const {
4808   Register Res = MI.getOperand(0).getReg();
4809   Register LHS = MI.getOperand(2).getReg();
4810   Register RHS = MI.getOperand(3).getReg();
4811   uint16_t Flags = MI.getFlags();
4812 
4813   LLT S32 = LLT::scalar(32);
4814   LLT S1 = LLT::scalar(1);
4815 
4816   auto Abs = B.buildFAbs(S32, RHS, Flags);
4817   const APFloat C0Val(1.0f);
4818 
4819   auto C0 = B.buildFConstant(S32, 0x1p+96f);
4820   auto C1 = B.buildFConstant(S32, 0x1p-32f);
4821   auto C2 = B.buildFConstant(S32, 1.0f);
4822 
4823   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4824   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4825 
4826   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4827 
4828   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4829     .addUse(Mul0.getReg(0))
4830     .setMIFlags(Flags);
4831 
4832   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4833 
4834   B.buildFMul(Res, Sel, Mul1, Flags);
4835 
4836   MI.eraseFromParent();
4837   return true;
4838 }
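
// The pre/post scaling above is what keeps the reciprocal in range (sketch of
// the math only): with s = 2^-32 when |RHS| > 2^96 and s = 1 otherwise,
//   Res = s * (LHS * rcp(RHS * s)) ~= LHS / RHS,
// since the two factors of s cancel while RHS * s stays representable in f32.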
4839 
4840 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
4841                                         MachineRegisterInfo &MRI,
4842                                         MachineIRBuilder &B) const {
4843   // For the double type, the SQRT and RSQ instructions don't have the required
4844   // precision, so we apply Goldschmidt's algorithm to improve the result:
4845   //
4846   //   y0 = rsq(x)
4847   //   g0 = x * y0
4848   //   h0 = 0.5 * y0
4849   //
4850   //   r0 = 0.5 - h0 * g0
4851   //   g1 = g0 * r0 + g0
4852   //   h1 = h0 * r0 + h0
4853   //
4854   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
4855   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
4856   //   h2 = h1 * r1 + h1
4857   //
4858   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
4859   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
4860   //
4861   //   sqrt(x) = g3
4862 
4863   const LLT S1 = LLT::scalar(1);
4864   const LLT S32 = LLT::scalar(32);
4865   const LLT F64 = LLT::scalar(64);
4866 
4867   Register Dst = MI.getOperand(0).getReg();
4868   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
4869 
4870   Register X = MI.getOperand(1).getReg();
4871   unsigned Flags = MI.getFlags();
4872 
4873   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
4874 
4875   auto ZeroInt = B.buildConstant(S32, 0);
4876   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
4877 
4878   // Scale up input if it is too small.
4879   auto ScaleUpFactor = B.buildConstant(S32, 256);
4880   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
4881   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
4882 
4883   auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
4884                    .addReg(SqrtX.getReg(0));
4885 
4886   auto Half = B.buildFConstant(F64, 0.5);
4887   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
4888   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
4889 
4890   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
4891   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
4892 
4893   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
4894   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
4895 
4896   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
4897   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
4898 
4899   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
4900 
4901   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
4902   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
4903 
4904   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
4905 
4906   // Scale down the result.
4907   auto ScaleDownFactor = B.buildConstant(S32, -128);
4908   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
4909   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
4910 
4911   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
4912   // with finite only or nsz because rsq(+/-0) = +/-inf
4913 
4914   // TODO: Check for DAZ and expand to subnormals
4915   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
4916 
4917   // If x is +INF, +0, or -0, use its original value
4918   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
4919 
4920   MI.eraseFromParent();
4921   return true;
4922 }
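
// Host-side sketch of the same Goldschmidt iteration (illustrative only; it
// ignores the ldexp-based scaling and the zero/inf select, and uses fma()
// from <math.h> in place of the fused GMIR operations):
//
//   static double sqrt_goldschmidt(double x) {
//     double y = 1.0 / sqrt(x);          // stands in for V_RSQ_F64
//     double g = x * y, h = 0.5 * y;
//     double r = fma(-h, g, 0.5);
//     g = fma(g, r, g);
//     h = fma(h, r, h);
//     double d = fma(-g, g, x);
//     g = fma(d, h, g);
//     d = fma(-g, g, x);
//     return fma(d, h, g);
//   }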
4923 
4924 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4925 // FIXME: Why do we handle this one but not other removed instructions?
4926 //
4927 // Reciprocal square root.  The clamp prevents infinite results, clamping
4928 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4929 // +-max_float.
4930 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4931                                                     MachineRegisterInfo &MRI,
4932                                                     MachineIRBuilder &B) const {
4933   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4934     return true;
4935 
4936   Register Dst = MI.getOperand(0).getReg();
4937   Register Src = MI.getOperand(2).getReg();
4938   auto Flags = MI.getFlags();
4939 
4940   LLT Ty = MRI.getType(Dst);
4941 
4942   const fltSemantics *FltSemantics;
4943   if (Ty == LLT::scalar(32))
4944     FltSemantics = &APFloat::IEEEsingle();
4945   else if (Ty == LLT::scalar(64))
4946     FltSemantics = &APFloat::IEEEdouble();
4947   else
4948     return false;
4949 
4950   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4951     .addUse(Src)
4952     .setMIFlags(Flags);
4953 
4954   // The snan handling difference doesn't matter here: the rsq has already
4955   // quieted any snan (or not), so use whichever form will directly select.
4956   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4957   const bool UseIEEE = MFI->getMode().IEEE;
4958 
4959   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4960   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4961                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4962 
4963   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4964 
4965   if (UseIEEE)
4966     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4967   else
4968     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4969   MI.eraseFromParent();
4970   return true;
4971 }
4972 
4973 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4974   switch (IID) {
4975   case Intrinsic::amdgcn_ds_fadd:
4976     return AMDGPU::G_ATOMICRMW_FADD;
4977   case Intrinsic::amdgcn_ds_fmin:
4978     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4979   case Intrinsic::amdgcn_ds_fmax:
4980     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4981   default:
4982     llvm_unreachable("not a DS FP intrinsic");
4983   }
4984 }
4985 
4986 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4987                                                       MachineInstr &MI,
4988                                                       Intrinsic::ID IID) const {
4989   GISelChangeObserver &Observer = Helper.Observer;
4990   Observer.changingInstr(MI);
4991 
4992   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4993 
4994   // The remaining operands were used to set fields in the MemOperand on
4995   // construction.
4996   for (int I = 6; I > 3; --I)
4997     MI.removeOperand(I);
4998 
4999   MI.removeOperand(1); // Remove the intrinsic ID.
5000   Observer.changedInstr(MI);
5001   return true;
5002 }
5003 
5004 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5005                                             MachineRegisterInfo &MRI,
5006                                             MachineIRBuilder &B) const {
5007   uint64_t Offset =
5008     ST.getTargetLowering()->getImplicitParameterOffset(
5009       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5010   LLT DstTy = MRI.getType(DstReg);
5011   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5012 
5013   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5014   if (!loadInputValue(KernargPtrReg, B,
5015                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5016     return false;
5017 
5018   // FIXME: This should be nuw
5019   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5020   return true;
5021 }
5022 
5023 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5024 /// bits of the pointer and replace them with the stride argument, then
5025 /// merge_values everything together. In the common case of a raw buffer (the
5026 /// stride component is 0), we can just AND off the upper half.
5027 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5028     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5029   Register Result = MI.getOperand(0).getReg();
5030   Register Pointer = MI.getOperand(2).getReg();
5031   Register Stride = MI.getOperand(3).getReg();
5032   Register NumRecords = MI.getOperand(4).getReg();
5033   Register Flags = MI.getOperand(5).getReg();
5034 
5035   LLT S32 = LLT::scalar(32);
5036 
5037   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5038   auto Unmerge = B.buildUnmerge(S32, Pointer);
5039   Register LowHalf = Unmerge.getReg(0);
5040   Register HighHalf = Unmerge.getReg(1);
5041 
5042   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5043   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5044 
5045   MachineInstrBuilder NewHighHalf = Masked;
5046   std::optional<ValueAndVReg> StrideConst =
5047       getIConstantVRegValWithLookThrough(Stride, MRI);
5048   if (!StrideConst || !StrideConst->Value.isZero()) {
5049     MachineInstrBuilder ShiftedStride;
5050     if (StrideConst) {
5051       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5052       uint32_t ShiftedStrideVal = StrideVal << 16;
5053       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5054     } else {
5055       auto ExtStride = B.buildAnyExt(S32, Stride);
5056       auto ShiftConst = B.buildConstant(S32, 16);
5057       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5058     }
5059     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5060   }
5061   Register NewHighHalfReg = NewHighHalf.getReg(0);
5062   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5063   MI.eraseFromParent();
5064   return true;
5065 }
5066 
5067 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5068                                                  MachineRegisterInfo &MRI,
5069                                                  MachineIRBuilder &B) const {
5070   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5071   if (!MFI->isEntryFunction()) {
5072     return legalizePreloadedArgIntrin(MI, MRI, B,
5073                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5074   }
5075 
5076   Register DstReg = MI.getOperand(0).getReg();
5077   if (!getImplicitArgPtr(DstReg, MRI, B))
5078     return false;
5079 
5080   MI.eraseFromParent();
5081   return true;
5082 }
5083 
5084 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5085                                          MachineRegisterInfo &MRI,
5086                                          MachineIRBuilder &B) const {
5087   Function &F = B.getMF().getFunction();
5088   std::optional<uint32_t> KnownSize =
5089       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5090   if (KnownSize.has_value())
5091     B.buildConstant(DstReg, *KnownSize);
5092   return false;
5093 }
5094 
5095 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5096                                               MachineRegisterInfo &MRI,
5097                                               MachineIRBuilder &B) const {
5098 
5099   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5100   if (!MFI->isEntryFunction()) {
5101     return legalizePreloadedArgIntrin(MI, MRI, B,
5102                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5103   }
5104 
5105   Register DstReg = MI.getOperand(0).getReg();
5106   if (!getLDSKernelId(DstReg, MRI, B))
5107     return false;
5108 
5109   MI.eraseFromParent();
5110   return true;
5111 }
5112 
5113 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5114                                               MachineRegisterInfo &MRI,
5115                                               MachineIRBuilder &B,
5116                                               unsigned AddrSpace) const {
5117   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5118   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5119   Register Hi32 = Unmerge.getReg(1);
5120 
5121   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5122   MI.eraseFromParent();
5123   return true;
5124 }
5125 
5126 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5127 // offset (the offset that is included in bounds checking and swizzling, to be
5128 // split between the instruction's voffset and immoffset fields) and soffset
5129 // (the offset that is excluded from bounds checking and swizzling, to go in
5130 // the instruction's soffset field).  This function takes the first kind of
5131 // offset and figures out how to split it between voffset and immoffset.
5132 std::pair<Register, unsigned>
5133 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5134                                         Register OrigOffset) const {
5135   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
5136   Register BaseReg;
5137   unsigned ImmOffset;
5138   const LLT S32 = LLT::scalar(32);
5139   MachineRegisterInfo &MRI = *B.getMRI();
5140 
5141   std::tie(BaseReg, ImmOffset) =
5142       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5143 
5144   // If BaseReg is a pointer, convert it to int.
5145   if (MRI.getType(BaseReg).isPointer())
5146     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5147 
5148   // If the immediate value is too big for the immoffset field, put only bits
5149   // that would normally fit in the immoffset field. The remaining value that
5150   // is copied/added for the voffset field is a large power of 2, and it
5151   // stands more chance of being CSEd with the copy/add for another similar
5152   // load/store.
5153   // However, do not do that rounding down if the remaining (overflow) value
5154   // would be negative, as it appears to be illegal to have a negative offset
5155   // in the vgpr, even if adding the immediate offset makes it positive.
5156   unsigned Overflow = ImmOffset & ~MaxImm;
5157   ImmOffset -= Overflow;
5158   if ((int32_t)Overflow < 0) {
5159     Overflow += ImmOffset;
5160     ImmOffset = 0;
5161   }
5162 
5163   if (Overflow != 0) {
5164     if (!BaseReg) {
5165       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5166     } else {
5167       auto OverflowVal = B.buildConstant(S32, Overflow);
5168       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5169     }
5170   }
5171 
5172   if (!BaseReg)
5173     BaseReg = B.buildConstant(S32, 0).getReg(0);
5174 
5175   return std::pair(BaseReg, ImmOffset);
5176 }
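
// Worked example of the split above (illustrative; MaxImm is the subtarget's
// MUBUF immediate limit, e.g. 4095): an incoming constant offset of 4200 is
// split into Overflow = 4096, added into the voffset register, and
// ImmOffset = 104 in the instruction, so the power-of-two add has a better
// chance of being CSEd across neighbouring accesses.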
5177 
5178 /// Handle register layout difference for f16 images for some subtargets.
5179 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5180                                              MachineRegisterInfo &MRI,
5181                                              Register Reg,
5182                                              bool ImageStore) const {
5183   const LLT S16 = LLT::scalar(16);
5184   const LLT S32 = LLT::scalar(32);
5185   LLT StoreVT = MRI.getType(Reg);
5186   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5187 
5188   if (ST.hasUnpackedD16VMem()) {
5189     auto Unmerge = B.buildUnmerge(S16, Reg);
5190 
5191     SmallVector<Register, 4> WideRegs;
5192     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5193       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5194 
5195     int NumElts = StoreVT.getNumElements();
5196 
5197     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5198         .getReg(0);
5199   }
5200 
5201   if (ImageStore && ST.hasImageStoreD16Bug()) {
5202     if (StoreVT.getNumElements() == 2) {
5203       SmallVector<Register, 4> PackedRegs;
5204       Reg = B.buildBitcast(S32, Reg).getReg(0);
5205       PackedRegs.push_back(Reg);
5206       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5207       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5208           .getReg(0);
5209     }
5210 
5211     if (StoreVT.getNumElements() == 3) {
5212       SmallVector<Register, 4> PackedRegs;
5213       auto Unmerge = B.buildUnmerge(S16, Reg);
5214       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5215         PackedRegs.push_back(Unmerge.getReg(I));
5216       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5217       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5218       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5219     }
5220 
5221     if (StoreVT.getNumElements() == 4) {
5222       SmallVector<Register, 4> PackedRegs;
5223       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5224       auto Unmerge = B.buildUnmerge(S32, Reg);
5225       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5226         PackedRegs.push_back(Unmerge.getReg(I));
5227       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5228       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5229           .getReg(0);
5230     }
5231 
5232     llvm_unreachable("invalid data type");
5233   }
5234 
5235   if (StoreVT == LLT::fixed_vector(3, S16)) {
5236     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5237               .getReg(0);
5238   }
5239   return Reg;
5240 }
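
// Layout example (illustrative): for a <4 x s16> store value, unpacked-d16
// subtargets want <4 x s32> with one half per dword (the any-extend loop
// above), while the image-store-d16 workaround instead pads/bitcasts the
// packed halves so the value occupies the number of dwords the hardware
// expects.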
5241 
5242 Register AMDGPULegalizerInfo::fixStoreSourceType(
5243   MachineIRBuilder &B, Register VData, bool IsFormat) const {
5244   MachineRegisterInfo *MRI = B.getMRI();
5245   LLT Ty = MRI->getType(VData);
5246 
5247   const LLT S16 = LLT::scalar(16);
5248 
5249   // Fixup buffer resources themselves needing to be v4i128.
5250   if (hasBufferRsrcWorkaround(Ty))
5251     return castBufferRsrcToV4I32(VData, B);
5252 
5253   // Fixup illegal register types for i8 stores.
5254   if (Ty == LLT::scalar(8) || Ty == S16) {
5255     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5256     return AnyExt;
5257   }
5258 
5259   if (Ty.isVector()) {
5260     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5261       if (IsFormat)
5262         return handleD16VData(B, *MRI, VData);
5263     }
5264   }
5265 
5266   return VData;
5267 }
5268 
5269 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5270                                               MachineRegisterInfo &MRI,
5271                                               MachineIRBuilder &B,
5272                                               bool IsTyped,
5273                                               bool IsFormat) const {
5274   Register VData = MI.getOperand(1).getReg();
5275   LLT Ty = MRI.getType(VData);
5276   LLT EltTy = Ty.getScalarType();
5277   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5278   const LLT S32 = LLT::scalar(32);
5279 
5280   VData = fixStoreSourceType(B, VData, IsFormat);
5281   castBufferRsrcArgToV4I32(MI, B, 2);
5282   Register RSrc = MI.getOperand(2).getReg();
5283 
5284   MachineMemOperand *MMO = *MI.memoperands_begin();
5285   const int MemSize = MMO->getSize();
5286 
5287   unsigned ImmOffset;
5288 
5289   // The typed intrinsics add an immediate after the registers.
5290   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5291 
5292   // The struct intrinsic variants add one additional operand over raw.
5293   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5294   Register VIndex;
5295   int OpOffset = 0;
5296   if (HasVIndex) {
5297     VIndex = MI.getOperand(3).getReg();
5298     OpOffset = 1;
5299   } else {
5300     VIndex = B.buildConstant(S32, 0).getReg(0);
5301   }
5302 
5303   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5304   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5305 
5306   unsigned Format = 0;
5307   if (IsTyped) {
5308     Format = MI.getOperand(5 + OpOffset).getImm();
5309     ++OpOffset;
5310   }
5311 
5312   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5313 
5314   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5315 
5316   unsigned Opc;
5317   if (IsTyped) {
5318     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5319                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5320   } else if (IsFormat) {
5321     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5322                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5323   } else {
5324     switch (MemSize) {
5325     case 1:
5326       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5327       break;
5328     case 2:
5329       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5330       break;
5331     default:
5332       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5333       break;
5334     }
5335   }
5336 
5337   auto MIB = B.buildInstr(Opc)
5338     .addUse(VData)              // vdata
5339     .addUse(RSrc)               // rsrc
5340     .addUse(VIndex)             // vindex
5341     .addUse(VOffset)            // voffset
5342     .addUse(SOffset)            // soffset
5343     .addImm(ImmOffset);         // offset(imm)
5344 
5345   if (IsTyped)
5346     MIB.addImm(Format);
5347 
5348   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5349      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5350      .addMemOperand(MMO);
5351 
5352   MI.eraseFromParent();
5353   return true;
5354 }
5355 
5356 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5357                             Register VIndex, Register VOffset, Register SOffset,
5358                             unsigned ImmOffset, unsigned Format,
5359                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5360                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5361   auto MIB = B.buildInstr(Opc)
5362                  .addDef(LoadDstReg) // vdata
5363                  .addUse(RSrc)       // rsrc
5364                  .addUse(VIndex)     // vindex
5365                  .addUse(VOffset)    // voffset
5366                  .addUse(SOffset)    // soffset
5367                  .addImm(ImmOffset); // offset(imm)
5368 
5369   if (IsTyped)
5370     MIB.addImm(Format);
5371 
5372   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5373       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5374       .addMemOperand(MMO);
5375 }
5376 
5377 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5378                                              MachineRegisterInfo &MRI,
5379                                              MachineIRBuilder &B,
5380                                              bool IsFormat,
5381                                              bool IsTyped) const {
5382   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5383   MachineMemOperand *MMO = *MI.memoperands_begin();
5384   const LLT MemTy = MMO->getMemoryType();
5385   const LLT S32 = LLT::scalar(32);
5386 
5387   Register Dst = MI.getOperand(0).getReg();
5388 
5389   Register StatusDst;
5390   int OpOffset = 0;
5391   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5392   bool IsTFE = MI.getNumExplicitDefs() == 2;
5393   if (IsTFE) {
5394     StatusDst = MI.getOperand(1).getReg();
5395     ++OpOffset;
5396   }
5397 
5398   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5399   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5400 
5401   // The typed intrinsics add an immediate after the registers.
5402   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5403 
5404   // The struct intrinsic variants add one additional operand over raw.
5405   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5406   Register VIndex;
5407   if (HasVIndex) {
5408     VIndex = MI.getOperand(3 + OpOffset).getReg();
5409     ++OpOffset;
5410   } else {
5411     VIndex = B.buildConstant(S32, 0).getReg(0);
5412   }
5413 
5414   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5415   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5416 
5417   unsigned Format = 0;
5418   if (IsTyped) {
5419     Format = MI.getOperand(5 + OpOffset).getImm();
5420     ++OpOffset;
5421   }
5422 
5423   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5424   unsigned ImmOffset;
5425 
5426   LLT Ty = MRI.getType(Dst);
5427   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5428   // logic doesn't have to handle that case.
5429   if (hasBufferRsrcWorkaround(Ty)) {
5430     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5431     Dst = MI.getOperand(0).getReg();
5432   }
5433   LLT EltTy = Ty.getScalarType();
5434   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5435   const bool Unpacked = ST.hasUnpackedD16VMem();
5436 
5437   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5438 
5439   unsigned Opc;
5440 
5441   // TODO: Support TFE for typed and narrow loads.
5442   if (IsTyped) {
5443     if (IsTFE)
5444       return false;
5445     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5446                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5447   } else if (IsFormat) {
5448     if (IsD16) {
5449       if (IsTFE)
5450         return false;
5451       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5452     } else {
5453       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5454                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5455     }
5456   } else {
5457     if (IsTFE)
5458       return false;
5459     switch (MemTy.getSizeInBits()) {
5460     case 8:
5461       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5462       break;
5463     case 16:
5464       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5465       break;
5466     default:
5467       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5468       break;
5469     }
5470   }
5471 
5472   if (IsTFE) {
5473     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5474     unsigned NumLoadDWords = NumValueDWords + 1;
5475     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5476     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5477     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5478                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5479     if (NumValueDWords == 1) {
5480       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5481     } else {
5482       SmallVector<Register, 5> LoadElts;
5483       for (unsigned I = 0; I != NumValueDWords; ++I)
5484         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5485       LoadElts.push_back(StatusDst);
5486       B.buildUnmerge(LoadElts, LoadDstReg);
5487       LoadElts.truncate(NumValueDWords);
5488       B.buildMergeLikeInstr(Dst, LoadElts);
5489     }
5490   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5491              (IsD16 && !Ty.isVector())) {
5492     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5493     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5494                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5495     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5496     B.buildTrunc(Dst, LoadDstReg);
5497   } else if (Unpacked && IsD16 && Ty.isVector()) {
5498     LLT UnpackedTy = Ty.changeElementSize(32);
5499     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5500     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5501                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5502     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5503     // FIXME: G_TRUNC should work, but legalization currently fails
5504     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5505     SmallVector<Register, 4> Repack;
5506     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5507       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5508     B.buildMergeLikeInstr(Dst, Repack);
5509   } else {
5510     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5511                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5512   }
5513 
5514   MI.eraseFromParent();
5515   return true;
5516 }
5517 
5518 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5519   switch (IntrID) {
5520   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5521   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5522   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5523   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5524     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5525   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5526   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5527   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5528   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5529     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5530   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5531   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5532   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5533   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5534     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5535   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5536   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5537   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5538   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5539     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5540   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5541   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5542   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5543   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5544     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5545   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5546   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5547   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5548   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5549     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5550   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5551   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5552   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5553   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5554     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5555   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5556   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5557   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5558   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5559     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5560   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5561   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5562   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5563   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5564     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5565   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5566   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5567   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5568   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5569     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5570   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5571   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5572   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5573   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5574     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5575   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5576   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5577   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5578   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5579     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5580   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5581   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5582   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5583   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5584     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5585   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5586   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5587   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5588   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5589     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5590   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5591   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5592   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5593   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5594     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5595   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5596   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5597   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5598   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5599     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5600   default:
5601     llvm_unreachable("unhandled atomic opcode");
5602   }
5603 }
5604 
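/// Lower a raw/struct buffer atomic intrinsic to the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo. The buffer offset is split into register
/// and immediate parts, and the operands are normalized so the pseudo always
/// carries vdata, [cmp,] rsrc, vindex, voffset, soffset, offset, aux and
/// idxen operands, e.g. roughly (not exact MIR syntax):
///   %dst = G_AMDGPU_BUFFER_ATOMIC_ADD %vdata, %rsrc, %vindex, %voffset,
///          %soffset, offset, aux, idxen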
5605 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5606                                                MachineIRBuilder &B,
5607                                                Intrinsic::ID IID) const {
5608   const bool IsCmpSwap =
5609       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5610       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5611       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5612       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5613   const bool HasReturn = MI.getNumExplicitDefs() != 0;
5614 
5615   Register Dst;
5616 
5617   int OpOffset = 0;
5618   if (HasReturn) {
5619     // A few FP atomics do not support return values.
5620     Dst = MI.getOperand(0).getReg();
5621   } else {
5622     OpOffset = -1;
5623   }
5624 
5625   // Since we don't have 128-bit atomics, we don't need to handle the case of
5626   // p8 arguments to the atomic itself.
5627   Register VData = MI.getOperand(2 + OpOffset).getReg();
5628   Register CmpVal;
5629 
5630   if (IsCmpSwap) {
5631     CmpVal = MI.getOperand(3 + OpOffset).getReg();
5632     ++OpOffset;
5633   }
5634 
5635   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5636   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5637   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
5638 
5639   // The struct intrinsic variants add one additional operand over raw.
5640   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5641   Register VIndex;
5642   if (HasVIndex) {
5643     VIndex = MI.getOperand(4 + OpOffset).getReg();
5644     ++OpOffset;
5645   } else {
5646     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5647   }
5648 
5649   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5650   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5651   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5652 
5653   MachineMemOperand *MMO = *MI.memoperands_begin();
5654 
5655   unsigned ImmOffset;
5656   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5657 
5658   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
5659 
5660   if (HasReturn)
5661     MIB.addDef(Dst);
5662 
5663   MIB.addUse(VData); // vdata
5664 
5665   if (IsCmpSwap)
5666     MIB.addReg(CmpVal);
5667 
5668   MIB.addUse(RSrc)               // rsrc
5669      .addUse(VIndex)             // vindex
5670      .addUse(VOffset)            // voffset
5671      .addUse(SOffset)            // soffset
5672      .addImm(ImmOffset)          // offset(imm)
5673      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5674      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5675      .addMemOperand(MMO);
5676 
5677   MI.eraseFromParent();
5678   return true;
5679 }
5680 
5681 /// Pack the 16-bit address operands of \p MI into dword-sized registers with
5682 /// s16 typed elements, appending the results to \p PackedAddrs.
5683 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5684                                       SmallVectorImpl<Register> &PackedAddrs,
5685                                       unsigned ArgOffset,
5686                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5687                                       bool IsA16, bool IsG16) {
5688   const LLT S16 = LLT::scalar(16);
5689   const LLT V2S16 = LLT::fixed_vector(2, 16);
5690   auto EndIdx = Intr->VAddrEnd;
5691 
5692   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5693     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5694     if (!SrcOp.isReg())
5695       continue; // _L to _LZ may have eliminated this.
5696 
5697     Register AddrReg = SrcOp.getReg();
5698 
5699     if ((I < Intr->GradientStart) ||
5700         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5701         (I >= Intr->CoordStart && !IsA16)) {
5702       if ((I < Intr->GradientStart) && IsA16 &&
5703           (B.getMRI()->getType(AddrReg) == S16)) {
5704         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
5705         // Special handling of bias when A16 is on. Bias is of type half but
5706         // occupies a full 32 bits.
5707         PackedAddrs.push_back(
5708             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5709                 .getReg(0));
5710       } else {
5711         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
5712                "Bias needs to be converted to 16 bit in A16 mode");
5713         // Handle any gradient or coordinate operands that should not be packed
5714         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
5715         PackedAddrs.push_back(AddrReg);
5716       }
5717     } else {
5718       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
5719       // derivatives dx/dh and dx/dv are packed with undef.
5720       if (((I + 1) >= EndIdx) ||
5721           ((Intr->NumGradients / 2) % 2 == 1 &&
5722            (I == static_cast<unsigned>(Intr->GradientStart +
5723                                        (Intr->NumGradients / 2) - 1) ||
5724             I == static_cast<unsigned>(Intr->GradientStart +
5725                                        Intr->NumGradients - 1))) ||
5726           // Check for _L to _LZ optimization
5727           !MI.getOperand(ArgOffset + I + 1).isReg()) {
5728         PackedAddrs.push_back(
5729             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5730                 .getReg(0));
5731       } else {
5732         PackedAddrs.push_back(
5733             B.buildBuildVector(
5734                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
5735                 .getReg(0));
5736         ++I;
5737       }
5738     }
5739   }
5740 }
5741 
5742 /// Convert from separate vaddr components to a single vector address register,
5743 /// and replace the remaining operands with $noreg.
5744 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
5745                                      int DimIdx, int NumVAddrs) {
5746   const LLT S32 = LLT::scalar(32);
5747   (void)S32;
5748   SmallVector<Register, 8> AddrRegs;
5749   for (int I = 0; I != NumVAddrs; ++I) {
5750     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
5751     if (SrcOp.isReg()) {
5752       AddrRegs.push_back(SrcOp.getReg());
5753       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
5754     }
5755   }
5756 
5757   int NumAddrRegs = AddrRegs.size();
5758   if (NumAddrRegs != 1) {
5759     auto VAddr =
5760         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
5761     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
5762   }
5763 
5764   for (int I = 1; I != NumVAddrs; ++I) {
5765     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
5766     if (SrcOp.isReg())
5767       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
5768   }
5769 }
5770 
5771 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
5772 ///
5773 /// Depending on the subtarget, load/store with 16-bit element data need to be
5774 /// rewritten to use the low half of 32-bit registers, or directly use a packed
5775 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
5776 /// registers.
5777 ///
5778 /// We don't want to directly select image instructions just yet, but also want
5779 /// to expose all register repacking to the legalizer/combiners. We also don't
5780 /// want a selected instruction entering RegBankSelect. In order to avoid
5781 /// defining a multitude of intermediate image instructions, directly hack on
5782 /// the intrinsic's arguments. In cases like a16 addresses, this requires
5783 /// padding the now-unnecessary arguments with $noreg.
5784 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
5785     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
5786     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
5787 
5788   const MachineFunction &MF = *MI.getMF();
5789   const unsigned NumDefs = MI.getNumExplicitDefs();
5790   const unsigned ArgOffset = NumDefs + 1;
5791   bool IsTFE = NumDefs == 2;
5792   // We are only processing the operands of d16 image operations on subtargets
5793   // that use the unpacked register layout, or need to repack the TFE result.
5794 
5795   // TODO: Do we need to guard against already legalized intrinsics?
5796   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5797       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
5798 
5799   MachineRegisterInfo *MRI = B.getMRI();
5800   const LLT S32 = LLT::scalar(32);
5801   const LLT S16 = LLT::scalar(16);
5802   const LLT V2S16 = LLT::fixed_vector(2, 16);
5803 
5804   unsigned DMask = 0;
5805   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
5806   LLT Ty = MRI->getType(VData);
5807 
5808   // Check for 16 bit addresses and pack if true.
5809   LLT GradTy =
5810       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
5811   LLT AddrTy =
5812       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
5813   const bool IsG16 =
5814       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
5815   const bool IsA16 = AddrTy == S16;
5816   const bool IsD16 = Ty.getScalarType() == S16;
5817 
5818   int DMaskLanes = 0;
5819   if (!BaseOpcode->Atomic) {
5820     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
5821     if (BaseOpcode->Gather4) {
5822       DMaskLanes = 4;
5823     } else if (DMask != 0) {
5824       DMaskLanes = llvm::popcount(DMask);
5825     } else if (!IsTFE && !BaseOpcode->Store) {
5826       // If dmask is 0, this is a no-op load. This can be eliminated.
5827       B.buildUndef(MI.getOperand(0));
5828       MI.eraseFromParent();
5829       return true;
5830     }
5831   }
5832 
5833   Observer.changingInstr(MI);
5834   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
5835 
5836   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
5837                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
5838   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
5839                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
5840   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
5841 
5842   // Track that we legalized this
5843   MI.setDesc(B.getTII().get(NewOpcode));
5844 
5845   // Expecting to get an error flag since TFC is on - and dmask is 0. Force
5846   // dmask to be at least 1, otherwise the instruction will fail.
5847   if (IsTFE && DMask == 0) {
5848     DMask = 0x1;
5849     DMaskLanes = 1;
5850     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
5851   }
5852 
5853   if (BaseOpcode->Atomic) {
5854     Register VData0 = MI.getOperand(2).getReg();
5855     LLT Ty = MRI->getType(VData0);
5856 
5857     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
5858     if (Ty.isVector())
5859       return false;
5860 
5861     if (BaseOpcode->AtomicX2) {
5862       Register VData1 = MI.getOperand(3).getReg();
5863       // The two values are packed in one register.
5864       LLT PackedTy = LLT::fixed_vector(2, Ty);
5865       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
5866       MI.getOperand(2).setReg(Concat.getReg(0));
5867       MI.getOperand(3).setReg(AMDGPU::NoRegister);
5868     }
5869   }
5870 
5871   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
5872 
5873   // Rewrite the addressing register layout before doing anything else.
5874   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
5875     // 16 bit gradients are supported, but are tied to the A16 control,
5876     // so both gradients and addresses must be 16 bit.
5877     return false;
5878   }
5879 
5880   if (IsA16 && !ST.hasA16()) {
5881     // A16 not supported
5882     return false;
5883   }
5884 
5885   const unsigned NSAMaxSize = ST.getNSAMaxSize();
5886   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
5887 
5888   if (IsA16 || IsG16) {
5889     if (Intr->NumVAddrs > 1) {
5890       SmallVector<Register, 4> PackedRegs;
5891 
5892       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
5893                                 IsG16);
5894 
5895       // See also below in the non-a16 branch
5896       const bool UseNSA = ST.hasNSAEncoding() &&
5897                           PackedRegs.size() >= ST.getNSAThreshold(MF) &&
5898                           (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
5899       const bool UsePartialNSA =
5900           UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
5901 
5902       if (UsePartialNSA) {
5903         // Pack registers that would go over NSAMaxSize into last VAddr register
5904         LLT PackedAddrTy =
5905             LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
5906         auto Concat = B.buildConcatVectors(
5907             PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
5908         PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
5909         PackedRegs.resize(NSAMaxSize);
5910       } else if (!UseNSA && PackedRegs.size() > 1) {
5911         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
5912         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
5913         PackedRegs[0] = Concat.getReg(0);
5914         PackedRegs.resize(1);
5915       }
5916 
5917       const unsigned NumPacked = PackedRegs.size();
5918       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
5919         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5920         if (!SrcOp.isReg()) {
5921           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
5922           continue;
5923         }
5924 
5925         assert(SrcOp.getReg() != AMDGPU::NoRegister);
5926 
5927         if (I - Intr->VAddrStart < NumPacked)
5928           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
5929         else
5930           SrcOp.setReg(AMDGPU::NoRegister);
5931       }
5932     }
5933   } else {
5934     // If the register allocator cannot place the address registers contiguously
5935     // without introducing moves, then using the non-sequential address encoding
5936     // is always preferable, since it saves VALU instructions and is usually a
5937     // wash in terms of code size or even better.
5938     //
5939     // However, we currently have no way of hinting to the register allocator
5940     // that MIMG addresses should be placed contiguously when it is possible to
5941     // do so, so force non-NSA for the common 2-address case as a heuristic.
5942     //
5943     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5944     // allocation when possible.
5945     //
5946     // Partial NSA is allowed on GFX11 where the final register is a contiguous
5947     // set of the remaining addresses.
5948     const bool UseNSA = ST.hasNSAEncoding() &&
5949                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5950                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
5951     const bool UsePartialNSA =
5952         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
5953 
5954     if (UsePartialNSA) {
5955       convertImageAddrToPacked(B, MI,
5956                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
5957                                Intr->NumVAddrs - NSAMaxSize + 1);
5958     } else if (!UseNSA && Intr->NumVAddrs > 1) {
5959       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5960                                Intr->NumVAddrs);
5961     }
5962   }
5963 
5964   int Flags = 0;
5965   if (IsA16)
5966     Flags |= 1;
5967   if (IsG16)
5968     Flags |= 2;
5969   MI.addOperand(MachineOperand::CreateImm(Flags));
5970 
5971   if (BaseOpcode->Store) { // No TFE for stores?
5972     // TODO: Handle dmask trim
5973     if (!Ty.isVector() || !IsD16)
5974       return true;
5975 
5976     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5977     if (RepackedReg != VData) {
5978       MI.getOperand(1).setReg(RepackedReg);
5979     }
5980 
5981     return true;
5982   }
5983 
5984   Register DstReg = MI.getOperand(0).getReg();
5985   const LLT EltTy = Ty.getScalarType();
5986   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5987 
5988   // Confirm that the return type is large enough for the dmask specified
5989   if (NumElts < DMaskLanes)
5990     return false;
5991 
5992   if (NumElts > 4 || DMaskLanes > 4)
5993     return false;
5994 
5995   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5996   const LLT AdjustedTy =
5997       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5998 
5999   // The raw dword aligned data component of the load. The only legal cases
6000   // where this matters should be when using the packed D16 format, for
6001   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6002   LLT RoundedTy;
6003 
6004   // S32 vector to cover all data, plus TFE result element.
6005   LLT TFETy;
6006 
6007   // Register type to use for each loaded component. Will be S32 or V2S16.
6008   LLT RegTy;
6009 
6010   if (IsD16 && ST.hasUnpackedD16VMem()) {
6011     RoundedTy =
6012         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6013     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6014     RegTy = S32;
6015   } else {
6016     unsigned EltSize = EltTy.getSizeInBits();
6017     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6018     unsigned RoundedSize = 32 * RoundedElts;
6019     RoundedTy = LLT::scalarOrVector(
6020         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6021     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6022     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6023   }
6024 
6025   // The return type does not need adjustment.
6026   // TODO: Should we change s16 case to s32 or <2 x s16>?
6027   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6028     return true;
6029 
6030   Register Dst1Reg;
6031 
6032   // Insert after the instruction.
6033   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6034 
6035   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6036   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6037   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6038   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6039 
6040   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6041 
6042   MI.getOperand(0).setReg(NewResultReg);
6043 
6044   // In the IR, TFE is supposed to be used with a 2 element struct return
6045   // type. The instruction really returns these two values in one contiguous
6046   // register, with one additional dword beyond the loaded data. Rewrite the
6047   // return type to use a single register result.
6048 
6049   if (IsTFE) {
6050     Dst1Reg = MI.getOperand(1).getReg();
6051     if (MRI->getType(Dst1Reg) != S32)
6052       return false;
6053 
6054     // TODO: Make sure the TFE operand bit is set.
6055     MI.removeOperand(1);
6056 
6057     // Handle the easy case that requires no repack instructions.
6058     if (Ty == S32) {
6059       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6060       return true;
6061     }
6062   }
6063 
6064   // Now figure out how to copy the new result register back into the old
6065   // result.
6066   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6067 
6068   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6069 
6070   if (ResultNumRegs == 1) {
6071     assert(!IsTFE);
6072     ResultRegs[0] = NewResultReg;
6073   } else {
6074     // We have to repack into a new vector of some kind.
6075     for (int I = 0; I != NumDataRegs; ++I)
6076       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6077     B.buildUnmerge(ResultRegs, NewResultReg);
6078 
6079     // Drop the final TFE element to get the data part. The TFE result is
6080     // directly written to the right place already.
6081     if (IsTFE)
6082       ResultRegs.resize(NumDataRegs);
6083   }
6084 
6085   // For an s16 scalar result, we form an s32 result with a truncate regardless
6086   // of packed vs. unpacked.
6087   if (IsD16 && !Ty.isVector()) {
6088     B.buildTrunc(DstReg, ResultRegs[0]);
6089     return true;
6090   }
6091 
6092   // Avoid a build/concat_vector of 1 entry.
6093   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6094     B.buildBitcast(DstReg, ResultRegs[0]);
6095     return true;
6096   }
6097 
6098   assert(Ty.isVector());
6099 
6100   if (IsD16) {
6101     // For packed D16 results with TFE enabled, all the data components are
6102     // S32. Cast back to the expected type.
6103     //
6104     // TODO: We don't really need to load s32 elements. We would only need one
6105     // cast for the TFE result if a multiple of v2s16 was used.
6106     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6107       for (Register &Reg : ResultRegs)
6108         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6109     } else if (ST.hasUnpackedD16VMem()) {
6110       for (Register &Reg : ResultRegs)
6111         Reg = B.buildTrunc(S16, Reg).getReg(0);
6112     }
6113   }
6114 
6115   auto padWithUndef = [&](LLT Ty, int NumElts) {
6116     if (NumElts == 0)
6117       return;
6118     Register Undef = B.buildUndef(Ty).getReg(0);
6119     for (int I = 0; I != NumElts; ++I)
6120       ResultRegs.push_back(Undef);
6121   };
6122 
6123   // Pad out any elements eliminated due to the dmask.
6124   LLT ResTy = MRI->getType(ResultRegs[0]);
6125   if (!ResTy.isVector()) {
6126     padWithUndef(ResTy, NumElts - ResultRegs.size());
6127     B.buildBuildVector(DstReg, ResultRegs);
6128     return true;
6129   }
6130 
6131   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6132   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6133 
6134   // Deal with the one annoying legal case.
6135   const LLT V3S16 = LLT::fixed_vector(3, 16);
6136   if (Ty == V3S16) {
6137     if (IsTFE) {
6138       if (ResultRegs.size() == 1) {
6139         NewResultReg = ResultRegs[0];
6140       } else if (ResultRegs.size() == 2) {
6141         LLT V4S16 = LLT::fixed_vector(4, 16);
6142         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6143       } else {
6144         return false;
6145       }
6146     }
6147 
6148     if (MRI->getType(DstReg).getNumElements() <
6149         MRI->getType(NewResultReg).getNumElements()) {
6150       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6151     } else {
6152       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6153     }
6154     return true;
6155   }
6156 
6157   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6158   B.buildConcatVectors(DstReg, ResultRegs);
6159   return true;
6160 }
6161 
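/// Lower llvm.amdgcn.s.buffer.load to G_AMDGPU_S_BUFFER_LOAD: attach the
/// memory operand that the readnone intrinsic form cannot carry, handle p8
/// results via the <4 x s32> buffer resource workaround, bitcast awkward
/// result types, and widen non-power-of-2 results so the scalar load can be
/// selected.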
6162 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6163   LegalizerHelper &Helper, MachineInstr &MI) const {
6164   MachineIRBuilder &B = Helper.MIRBuilder;
6165   GISelChangeObserver &Observer = Helper.Observer;
6166 
6167   Register Dst = MI.getOperand(0).getReg();
6168   LLT Ty = B.getMRI()->getType(Dst);
6169   unsigned Size = Ty.getSizeInBits();
6170   MachineFunction &MF = B.getMF();
6171 
6172   Observer.changingInstr(MI);
6173 
6174   // Handle needing to s.buffer.load() a p8 value.
6175   if (hasBufferRsrcWorkaround(Ty)) {
6176     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6177     Dst = MI.getOperand(0).getReg();
6178     B.setInsertPt(B.getMBB(), MI);
6179   }
6180   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6181     Ty = getBitcastRegisterType(Ty);
6182     Helper.bitcastDst(MI, Ty, 0);
6183     Dst = MI.getOperand(0).getReg();
6184     B.setInsertPt(B.getMBB(), MI);
6185   }
6186 
6187   // FIXME: We don't really need this intermediate instruction. The intrinsic
6188   // should be fixed to have a memory operand. Since it's readnone, we're not
6189   // allowed to add one.
6190   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
6191   MI.removeOperand(1); // Remove intrinsic ID
6192 
6193   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6194   // TODO: Should this use datalayout alignment?
6195   const unsigned MemSize = (Size + 7) / 8;
6196   const Align MemAlign(4);
6197   MachineMemOperand *MMO = MF.getMachineMemOperand(
6198       MachinePointerInfo(),
6199       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6200           MachineMemOperand::MOInvariant,
6201       MemSize, MemAlign);
6202   MI.addMemOperand(MF, MMO);
6203 
6204   // There are no 96-bit result scalar loads, but widening to 128-bit should
6205   // always be legal. We may need to restore this to a 96-bit result if it turns
6206   // out this needs to be converted to a vector load during RegBankSelect.
6207   if (!isPowerOf2_32(Size)) {
6208     if (Ty.isVector())
6209       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6210     else
6211       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6212   }
6213 
6214   Observer.changedInstr(MI);
6215   return true;
6216 }
6217 
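/// Lower llvm.trap: fall back to an endpgm when no AMDHSA trap handler is
/// enabled, otherwise dispatch to the HSA lowering appropriate for the code
/// object version and the subtarget's doorbell support.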
6218 // TODO: Move to selection
6219 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
6220                                                 MachineRegisterInfo &MRI,
6221                                                 MachineIRBuilder &B) const {
6222   if (!ST.isTrapHandlerEnabled() ||
6223       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6224     return legalizeTrapEndpgm(MI, MRI, B);
6225 
6226   const Module *M = B.getMF().getFunction().getParent();
6227   unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
6228   if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
6229     return legalizeTrapHsaQueuePtr(MI, MRI, B);
6230 
6231   return ST.supportsGetDoorbellID() ?
6232          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6233 }
6234 
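/// Lower a trap to s_endpgm when no trap handler is available. If the trap is
/// not already at the end of its block, the block is split so the inserted
/// s_endpgm remains a terminator and successor phis are preserved.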
6235 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6236     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6237   const DebugLoc &DL = MI.getDebugLoc();
6238   MachineBasicBlock &BB = B.getMBB();
6239   MachineFunction *MF = BB.getParent();
6240 
6241   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6242     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6243       .addImm(0);
6244     MI.eraseFromParent();
6245     return true;
6246   }
6247 
6248   // We need a block split to make the real endpgm a terminator. We also don't
6249   // want to break phis in successor blocks, so we can't just delete to the
6250   // end of the block.
6251   BB.splitAt(MI, false /*UpdateLiveIns*/);
6252   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6253   MF->push_back(TrapBB);
6254   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6255     .addImm(0);
6256   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6257     .addMBB(TrapBB);
6258 
6259   BB.addSuccessor(TrapBB);
6260   MI.eraseFromParent();
6261   return true;
6262 }
6263 
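/// Lower llvm.trap for the AMDHSA ABI by passing the queue pointer to the
/// trap handler in SGPR0_SGPR1 and emitting s_trap. For code object version 5
/// and above, the queue pointer is loaded from the implicit kernarg segment.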
6264 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6265     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6266   MachineFunction &MF = B.getMF();
6267   const LLT S64 = LLT::scalar(64);
6268 
6269   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6270   // For code object version 5, queue_ptr is passed through implicit kernarg.
6271   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
6272       AMDGPU::AMDHSA_COV5) {
6273     AMDGPUTargetLowering::ImplicitParameter Param =
6274         AMDGPUTargetLowering::QUEUE_PTR;
6275     uint64_t Offset =
6276         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6277 
6278     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6279         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6280 
6281     if (!loadInputValue(KernargPtrReg, B,
6282                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6283       return false;
6284 
6285     // TODO: can we be smarter about machine pointer info?
6286     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6287     MachineMemOperand *MMO = MF.getMachineMemOperand(
6288         PtrInfo,
6289         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6290             MachineMemOperand::MOInvariant,
6291         LLT::scalar(64), commonAlignment(Align(64), Offset));
6292 
6293     // Pointer address
6294     Register LoadAddr = MRI.createGenericVirtualRegister(
6295         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6296     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6297                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6298     // Load address
6299     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6300     B.buildCopy(SGPR01, Temp);
6301     B.buildInstr(AMDGPU::S_TRAP)
6302         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6303         .addReg(SGPR01, RegState::Implicit);
6304     MI.eraseFromParent();
6305     return true;
6306   }
6307 
6308   // Pass queue pointer to trap handler as input, and insert trap instruction
6309   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6310   Register LiveIn =
6311     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6312   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6313     return false;
6314 
6315   B.buildCopy(SGPR01, LiveIn);
6316   B.buildInstr(AMDGPU::S_TRAP)
6317       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6318       .addReg(SGPR01, RegState::Implicit);
6319 
6320   MI.eraseFromParent();
6321   return true;
6322 }
6323 
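/// Lower llvm.trap to a bare s_trap with the HSA trap ID; this is used on
/// subtargets where the trap handler can retrieve the doorbell ID itself, so
/// no queue pointer needs to be passed.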
6324 bool AMDGPULegalizerInfo::legalizeTrapHsa(
6325     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6326   B.buildInstr(AMDGPU::S_TRAP)
6327       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6328   MI.eraseFromParent();
6329   return true;
6330 }
6331 
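/// Lower llvm.debugtrap: emit s_trap with the debug trap ID when an AMDHSA
/// trap handler is enabled, otherwise emit a warning and drop the intrinsic.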
6332 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6333     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6334   // If this is a non-HSA path or the trap handler is disabled, report a
6335   // warning accordingly.

6336   if (!ST.isTrapHandlerEnabled() ||
6337       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6338     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6339                                      "debugtrap handler not supported",
6340                                      MI.getDebugLoc(), DS_Warning);
6341     LLVMContext &Ctx = B.getMF().getFunction().getContext();
6342     Ctx.diagnose(NoTrap);
6343   } else {
6344     // Insert debug-trap instruction
6345     B.buildInstr(AMDGPU::S_TRAP)
6346         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6347   }
6348 
6349   MI.eraseFromParent();
6350   return true;
6351 }
6352 
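/// Lower llvm.amdgcn.image.bvh.intersect.ray to
/// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY, selecting the MIMG opcode for the
/// subtarget's encoding and repacking the node pointer, ray extent, origin
/// and (inverse) direction operands into the vaddr layout that encoding
/// expects (NSA registers or a single merged vector).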
6353 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6354                                                MachineIRBuilder &B) const {
6355   MachineRegisterInfo &MRI = *B.getMRI();
6356   const LLT S16 = LLT::scalar(16);
6357   const LLT S32 = LLT::scalar(32);
6358   const LLT V2S16 = LLT::fixed_vector(2, 16);
6359   const LLT V3S32 = LLT::fixed_vector(3, 32);
6360 
6361   Register DstReg = MI.getOperand(0).getReg();
6362   Register NodePtr = MI.getOperand(2).getReg();
6363   Register RayExtent = MI.getOperand(3).getReg();
6364   Register RayOrigin = MI.getOperand(4).getReg();
6365   Register RayDir = MI.getOperand(5).getReg();
6366   Register RayInvDir = MI.getOperand(6).getReg();
6367   Register TDescr = MI.getOperand(7).getReg();
6368 
6369   if (!ST.hasGFX10_AEncoding()) {
6370     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6371                                         "intrinsic not supported on subtarget",
6372                                         MI.getDebugLoc());
6373     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6374     return false;
6375   }
6376 
6377   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6378   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6379   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6380   const unsigned NumVDataDwords = 4;
6381   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6382   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6383   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
6384   const unsigned BaseOpcodes[2][2] = {
6385       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6386       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6387        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6388   int Opcode;
6389   if (UseNSA) {
6390     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6391                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
6392                                                : AMDGPU::MIMGEncGfx10NSA,
6393                                    NumVDataDwords, NumVAddrDwords);
6394   } else {
6395     Opcode = AMDGPU::getMIMGOpcode(
6396         BaseOpcodes[Is64][IsA16],
6397         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
6398         NumVDataDwords, NumVAddrDwords);
6399   }
6400   assert(Opcode != -1);
6401 
6402   SmallVector<Register, 12> Ops;
6403   if (UseNSA && IsGFX11Plus) {
6404     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6405       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6406       auto Merged = B.buildMergeLikeInstr(
6407           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6408       Ops.push_back(Merged.getReg(0));
6409     };
6410 
6411     Ops.push_back(NodePtr);
6412     Ops.push_back(RayExtent);
6413     packLanes(RayOrigin);
6414 
6415     if (IsA16) {
6416       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6417       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6418       auto MergedDir = B.buildMergeLikeInstr(
6419           V3S32,
6420           {B.buildBitcast(
6421                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6422                                                    UnmergeRayDir.getReg(0)}))
6423                .getReg(0),
6424            B.buildBitcast(
6425                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6426                                                    UnmergeRayDir.getReg(1)}))
6427                .getReg(0),
6428            B.buildBitcast(
6429                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6430                                                    UnmergeRayDir.getReg(2)}))
6431                .getReg(0)});
6432       Ops.push_back(MergedDir.getReg(0));
6433     } else {
6434       packLanes(RayDir);
6435       packLanes(RayInvDir);
6436     }
6437   } else {
6438     if (Is64) {
6439       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6440       Ops.push_back(Unmerge.getReg(0));
6441       Ops.push_back(Unmerge.getReg(1));
6442     } else {
6443       Ops.push_back(NodePtr);
6444     }
6445     Ops.push_back(RayExtent);
6446 
6447     auto packLanes = [&Ops, &S32, &B](Register Src) {
6448       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6449       Ops.push_back(Unmerge.getReg(0));
6450       Ops.push_back(Unmerge.getReg(1));
6451       Ops.push_back(Unmerge.getReg(2));
6452     };
6453 
6454     packLanes(RayOrigin);
6455     if (IsA16) {
6456       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6457       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6458       Register R1 = MRI.createGenericVirtualRegister(S32);
6459       Register R2 = MRI.createGenericVirtualRegister(S32);
6460       Register R3 = MRI.createGenericVirtualRegister(S32);
6461       B.buildMergeLikeInstr(R1,
6462                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6463       B.buildMergeLikeInstr(
6464           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6465       B.buildMergeLikeInstr(
6466           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6467       Ops.push_back(R1);
6468       Ops.push_back(R2);
6469       Ops.push_back(R3);
6470     } else {
6471       packLanes(RayDir);
6472       packLanes(RayInvDir);
6473     }
6474   }
6475 
6476   if (!UseNSA) {
6477     // Build a single vector containing all the operands so far prepared.
6478     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6479     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6480     Ops.clear();
6481     Ops.push_back(MergedOps);
6482   }
6483 
6484   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6485     .addDef(DstReg)
6486     .addImm(Opcode);
6487 
6488   for (Register R : Ops) {
6489     MIB.addUse(R);
6490   }
6491 
6492   MIB.addUse(TDescr)
6493      .addImm(IsA16 ? 1 : 0)
6494      .cloneMemRefs(MI);
6495 
6496   MI.eraseFromParent();
6497   return true;
6498 }
6499 
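/// Lower an fptrunc with an explicit constant rounding mode to the directed
/// rounding pseudos G_FPTRUNC_ROUND_UPWARD / G_FPTRUNC_ROUND_DOWNWARD; other
/// rounding modes are rejected.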
6500 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6501                                                MachineIRBuilder &B) const {
6502   unsigned Opc;
6503   int RoundMode = MI.getOperand(2).getImm();
6504 
6505   if (RoundMode == (int)RoundingMode::TowardPositive)
6506     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6507   else if (RoundMode == (int)RoundingMode::TowardNegative)
6508     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6509   else
6510     return false;
6511 
6512   B.buildInstr(Opc)
6513       .addDef(MI.getOperand(0).getReg())
6514       .addUse(MI.getOperand(1).getReg());
6515 
6516   MI.eraseFromParent();
6517 
6518   return true;
6519 }
6520 
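/// Custom legalization entry point for target intrinsics. Returns true if the
/// intrinsic was handled (possibly by rewriting it in place) or needs no
/// change, and false to report a legalization failure.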
6521 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
6522                                             MachineInstr &MI) const {
6523   MachineIRBuilder &B = Helper.MIRBuilder;
6524   MachineRegisterInfo &MRI = *B.getMRI();
6525 
6526   // Replace the G_BRCOND that uses these intrinsics with the exec-manipulating
6527   // branch pseudos.
6527   auto IntrID = MI.getIntrinsicID();
6528   switch (IntrID) {
6529   case Intrinsic::amdgcn_if:
6530   case Intrinsic::amdgcn_else: {
6531     MachineInstr *Br = nullptr;
6532     MachineBasicBlock *UncondBrTarget = nullptr;
6533     bool Negated = false;
6534     if (MachineInstr *BrCond =
6535             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6536       const SIRegisterInfo *TRI
6537         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6538 
6539       Register Def = MI.getOperand(1).getReg();
6540       Register Use = MI.getOperand(3).getReg();
6541 
6542       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6543 
6544       if (Negated)
6545         std::swap(CondBrTarget, UncondBrTarget);
6546 
6547       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6548       if (IntrID == Intrinsic::amdgcn_if) {
6549         B.buildInstr(AMDGPU::SI_IF)
6550           .addDef(Def)
6551           .addUse(Use)
6552           .addMBB(UncondBrTarget);
6553       } else {
6554         B.buildInstr(AMDGPU::SI_ELSE)
6555             .addDef(Def)
6556             .addUse(Use)
6557             .addMBB(UncondBrTarget);
6558       }
6559 
6560       if (Br) {
6561         Br->getOperand(0).setMBB(CondBrTarget);
6562       } else {
6563         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
6564         // since we're swapping branch targets it needs to be reinserted.
6565         // FIXME: IRTranslator should probably not do this
6566         B.buildBr(*CondBrTarget);
6567       }
6568 
6569       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
6570       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
6571       MI.eraseFromParent();
6572       BrCond->eraseFromParent();
6573       return true;
6574     }
6575 
6576     return false;
6577   }
6578   case Intrinsic::amdgcn_loop: {
6579     MachineInstr *Br = nullptr;
6580     MachineBasicBlock *UncondBrTarget = nullptr;
6581     bool Negated = false;
6582     if (MachineInstr *BrCond =
6583             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6584       const SIRegisterInfo *TRI
6585         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6586 
6587       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6588       Register Reg = MI.getOperand(2).getReg();
6589 
6590       if (Negated)
6591         std::swap(CondBrTarget, UncondBrTarget);
6592 
6593       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6594       B.buildInstr(AMDGPU::SI_LOOP)
6595         .addUse(Reg)
6596         .addMBB(UncondBrTarget);
6597 
6598       if (Br)
6599         Br->getOperand(0).setMBB(CondBrTarget);
6600       else
6601         B.buildBr(*CondBrTarget);
6602 
6603       MI.eraseFromParent();
6604       BrCond->eraseFromParent();
6605       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
6606       return true;
6607     }
6608 
6609     return false;
6610   }
6611   case Intrinsic::amdgcn_make_buffer_rsrc:
6612     return legalizePointerAsRsrcIntrin(MI, MRI, B);
6613   case Intrinsic::amdgcn_kernarg_segment_ptr:
6614     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
6615       // This only makes sense to call in a kernel, so just lower to null.
6616       B.buildConstant(MI.getOperand(0).getReg(), 0);
6617       MI.eraseFromParent();
6618       return true;
6619     }
6620 
6621     return legalizePreloadedArgIntrin(
6622       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
6623   case Intrinsic::amdgcn_implicitarg_ptr:
6624     return legalizeImplicitArgPtr(MI, MRI, B);
6625   case Intrinsic::amdgcn_workitem_id_x:
6626     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
6627                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
6628   case Intrinsic::amdgcn_workitem_id_y:
6629     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
6630                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
6631   case Intrinsic::amdgcn_workitem_id_z:
6632     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
6633                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
6634   case Intrinsic::amdgcn_workgroup_id_x:
6635     return legalizePreloadedArgIntrin(MI, MRI, B,
6636                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
6637   case Intrinsic::amdgcn_workgroup_id_y:
6638     return legalizePreloadedArgIntrin(MI, MRI, B,
6639                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
6640   case Intrinsic::amdgcn_workgroup_id_z:
6641     return legalizePreloadedArgIntrin(MI, MRI, B,
6642                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6643   case Intrinsic::amdgcn_lds_kernel_id:
6644     return legalizePreloadedArgIntrin(MI, MRI, B,
6645                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6646   case Intrinsic::amdgcn_dispatch_ptr:
6647     return legalizePreloadedArgIntrin(MI, MRI, B,
6648                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
6649   case Intrinsic::amdgcn_queue_ptr:
6650     return legalizePreloadedArgIntrin(MI, MRI, B,
6651                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
6652   case Intrinsic::amdgcn_implicit_buffer_ptr:
6653     return legalizePreloadedArgIntrin(
6654       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
6655   case Intrinsic::amdgcn_dispatch_id:
6656     return legalizePreloadedArgIntrin(MI, MRI, B,
6657                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
6658   case Intrinsic::r600_read_ngroups_x:
6659     // TODO: Emit error for hsa
6660     return legalizeKernargMemParameter(MI, B,
6661                                        SI::KernelInputOffsets::NGROUPS_X);
6662   case Intrinsic::r600_read_ngroups_y:
6663     return legalizeKernargMemParameter(MI, B,
6664                                        SI::KernelInputOffsets::NGROUPS_Y);
6665   case Intrinsic::r600_read_ngroups_z:
6666     return legalizeKernargMemParameter(MI, B,
6667                                        SI::KernelInputOffsets::NGROUPS_Z);
6668   case Intrinsic::r600_read_local_size_x:
6669     // TODO: Could insert G_ASSERT_ZEXT from s16
6670     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
6671   case Intrinsic::r600_read_local_size_y:
6672     // TODO: Could insert G_ASSERT_ZEXT from s16
6673     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
6674   case Intrinsic::r600_read_local_size_z:
6675     // TODO: Could insert G_ASSERT_ZEXT from s16
6676     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
6677   case Intrinsic::r600_read_global_size_x:
6678     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
6679   case Intrinsic::r600_read_global_size_y:
6680     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
6681   case Intrinsic::r600_read_global_size_z:
6682     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
6683   case Intrinsic::amdgcn_fdiv_fast:
6684     return legalizeFDIVFastIntrin(MI, MRI, B);
6685   case Intrinsic::amdgcn_is_shared:
6686     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
6687   case Intrinsic::amdgcn_is_private:
6688     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
6689   case Intrinsic::amdgcn_wavefrontsize: {
6690     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
6691     MI.eraseFromParent();
6692     return true;
6693   }
6694   case Intrinsic::amdgcn_s_buffer_load:
6695     return legalizeSBufferLoad(Helper, MI);
6696   case Intrinsic::amdgcn_raw_buffer_store:
6697   case Intrinsic::amdgcn_raw_ptr_buffer_store:
6698   case Intrinsic::amdgcn_struct_buffer_store:
6699   case Intrinsic::amdgcn_struct_ptr_buffer_store:
6700     return legalizeBufferStore(MI, MRI, B, false, false);
6701   case Intrinsic::amdgcn_raw_buffer_store_format:
6702   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
6703   case Intrinsic::amdgcn_struct_buffer_store_format:
6704   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
6705     return legalizeBufferStore(MI, MRI, B, false, true);
6706   case Intrinsic::amdgcn_raw_tbuffer_store:
6707   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
6708   case Intrinsic::amdgcn_struct_tbuffer_store:
6709   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
6710     return legalizeBufferStore(MI, MRI, B, true, true);
6711   case Intrinsic::amdgcn_raw_buffer_load:
6712   case Intrinsic::amdgcn_raw_ptr_buffer_load:
6713   case Intrinsic::amdgcn_struct_buffer_load:
6714   case Intrinsic::amdgcn_struct_ptr_buffer_load:
6715     return legalizeBufferLoad(MI, MRI, B, false, false);
6716   case Intrinsic::amdgcn_raw_buffer_load_format:
6717   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
6718   case Intrinsic::amdgcn_struct_buffer_load_format:
6719   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
6720     return legalizeBufferLoad(MI, MRI, B, true, false);
6721   case Intrinsic::amdgcn_raw_tbuffer_load:
6722   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
6723   case Intrinsic::amdgcn_struct_tbuffer_load:
6724   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
6725     return legalizeBufferLoad(MI, MRI, B, true, true);
6726   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6727   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6728   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6729   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6730   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6731   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6732   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6733   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6734   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6735   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6736   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6737   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6738   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6739   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6740   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6741   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6742   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6743   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6744   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6745   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6746   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6747   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6748   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6749   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6750   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6751   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6752   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6753   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6754   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6755   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6756   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6757   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6758   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6759   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6760   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6761   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6762   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6763   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6764   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6765   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6766   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6767   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6768   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6769   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6770   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6771   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6772   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6773   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6774   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6775   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6776   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6777   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6778   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6779   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6780   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6781   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6782   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6783   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6784   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6785   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6786   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6787   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6788   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6789   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6790     return legalizeBufferAtomic(MI, B, IntrID);
6791   case Intrinsic::trap:
6792     return legalizeTrapIntrinsic(MI, MRI, B);
6793   case Intrinsic::debugtrap:
6794     return legalizeDebugTrapIntrinsic(MI, MRI, B);
6795   case Intrinsic::amdgcn_rsq_clamp:
6796     return legalizeRsqClampIntrinsic(MI, MRI, B);
6797   case Intrinsic::amdgcn_ds_fadd:
6798   case Intrinsic::amdgcn_ds_fmin:
6799   case Intrinsic::amdgcn_ds_fmax:
6800     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
6801   case Intrinsic::amdgcn_image_bvh_intersect_ray:
6802     return legalizeBVHIntrinsic(MI, B);
6803   case Intrinsic::amdgcn_fmed3: {
6804     GISelChangeObserver &Observer = Helper.Observer;
6805 
6806     // FIXME: This is to work around the inability of tablegen match combiners to
6807     // match intrinsics in patterns.
6808     Observer.changingInstr(MI);
6809     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
6810     MI.removeOperand(1);
6811     Observer.changedInstr(MI);
6812     return true;
6813   }
6814   default: {
6815     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6816             AMDGPU::getImageDimIntrinsicInfo(IntrID))
6817       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
6818     return true;
6819   }
6820   }
6821 
6822   return true;
6823 }
6824