xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 3f0efe05432b1633991114ca4ca330102a561959)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47   "amdgpu-global-isel-new-legality",
48   cl::desc("Use GlobalISel desired legality, rather than try to use "
49            "rules compatible with selection patterns"),
50   cl::init(false),
51   cl::ReallyHidden);
52 
53 static constexpr unsigned MaxRegisterSize = 1024;
54 
55 // Round the number of vector elements up to the next power of two.
56 static LLT getPow2VectorType(LLT Ty) {
57   unsigned NElts = Ty.getNumElements();
58   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
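// For illustration (the values follow directly from the arithmetic above):
//   getPow2VectorType(LLT::fixed_vector(3, 16)) == LLT::fixed_vector(4, 16)
//   getPow2VectorType(LLT::fixed_vector(5, 32)) == LLT::fixed_vector(8, 32)
// Power-of-two element counts such as <4 x s32> are returned unchanged.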
61 
62 // Round the scalar size in bits up to the next power of two.
63 static LLT getPow2ScalarType(LLT Ty) {
64   unsigned Bits = Ty.getSizeInBits();
65   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
66   return LLT::scalar(Pow2Bits);
67 }
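// Likewise, getPow2ScalarType(LLT::scalar(48)) == LLT::scalar(64), and sizes
// that are already powers of two (s32, s64) are returned unchanged.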
68 
69 /// \returns true if this is an odd-sized vector which should be widened by
70 /// adding an additional element. This is mostly to handle <3 x s16> ->
71 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     if (!Ty.isVector())
76       return false;
77 
78     const LLT EltTy = Ty.getElementType();
79     const unsigned EltSize = EltTy.getSizeInBits();
80     return Ty.getNumElements() % 2 != 0 &&
81            EltSize > 1 && EltSize < 32 &&
82            Ty.getSizeInBits() % 32 != 0;
83   };
84 }
85 
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     return Ty.getSizeInBits() % 32 == 0;
90   };
91 }
92 
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getScalarType();
97     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98   };
99 }
100 
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     return std::pair(TypeIdx,
106                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107   };
108 }
109 
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111   return [=](const LegalityQuery &Query) {
112     const LLT Ty = Query.Types[TypeIdx];
113     const LLT EltTy = Ty.getElementType();
114     unsigned Size = Ty.getSizeInBits();
115     unsigned Pieces = (Size + 63) / 64;
116     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117     return std::pair(TypeIdx, LLT::scalarOrVector(
118                                   ElementCount::getFixed(NewNumElts), EltTy));
119   };
120 }
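// Worked example of the arithmetic above: <5 x s32> is 160 bits, so Pieces = 3
// and NewNumElts = (5 + 1) / 3 = 2, giving <2 x s32>. <3 x s64> collapses to
// the scalar s64 because NewNumElts becomes 1.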
121 
122 // Increase the number of vector elements so that the total size reaches the
123 // next multiple of 32 bits.
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125   return [=](const LegalityQuery &Query) {
126     const LLT Ty = Query.Types[TypeIdx];
127 
128     const LLT EltTy = Ty.getElementType();
129     const int Size = Ty.getSizeInBits();
130     const int EltSize = EltTy.getSizeInBits();
131     const int NextMul32 = (Size + 31) / 32;
132 
133     assert(EltSize < 32);
134 
135     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137   };
138 }
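// Worked example: <3 x s8> is 24 bits, so NextMul32 = 1 (i.e. one 32-bit
// chunk) and NewNumElts = (32 + 7) / 8 = 4, giving <4 x s8>. Similarly
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits). Note that NextMul32 is
// really a count of 32-bit chunks, not a size in bits.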
139 
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142   return [=](const LegalityQuery &Query) {
143     const LLT Ty = Query.Types[TypeIdx];
144     const unsigned NumElts = Ty.getNumElements();
145     const unsigned EltSize = Ty.getElementType().getSizeInBits();
146     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147 
148     assert(EltSize == 32 || EltSize == 64);
149     assert(Ty.getSizeInBits() < MaxRegisterSize);
150 
151     unsigned NewNumElts;
152     // Find the nearest legal RegClass that is larger than the current type.
153     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155         break;
156     }
157 
158     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159   };
160 }
161 
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163   if (!Ty.isVector())
164     return LLT::scalar(128);
165   const ElementCount NumElems = Ty.getElementCount();
166   return LLT::vector(NumElems, LLT::scalar(128));
167 }
168 
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170   if (!Ty.isVector())
171     return LLT::fixed_vector(4, LLT::scalar(32));
172   const unsigned NumElems = Ty.getElementCount().getFixedValue();
173   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
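// For illustration: a plain p8 resource maps to <4 x s32> here (and to s128 in
// the scalar form above), while <2 x p8> maps to <8 x s32> (and <2 x s128>).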
175 
176 static LLT getBitcastRegisterType(const LLT Ty) {
177   const unsigned Size = Ty.getSizeInBits();
178 
179   if (Size <= 32) {
180     // <2 x s8> -> s16
181     // <4 x s8> -> s32
182     return LLT::scalar(Size);
183   }
184 
185   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
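// Beyond the <= 32-bit cases noted above, e.g. <4 x s16> (64 bits) maps to
// <2 x s32>, and <6 x s16> (96 bits) maps to <3 x s32>.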
187 
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189   return [=](const LegalityQuery &Query) {
190     const LLT Ty = Query.Types[TypeIdx];
191     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192   };
193 }
194 
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196   return [=](const LegalityQuery &Query) {
197     const LLT Ty = Query.Types[TypeIdx];
198     unsigned Size = Ty.getSizeInBits();
199     assert(Size % 32 == 0);
200     return std::pair(
201         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202   };
203 }
204 
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206   return [=](const LegalityQuery &Query) {
207     const LLT QueryTy = Query.Types[TypeIdx];
208     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209   };
210 }
211 
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213   return [=](const LegalityQuery &Query) {
214     const LLT QueryTy = Query.Types[TypeIdx];
215     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216   };
217 }
218 
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223   };
224 }
225 
226 static bool isRegisterSize(unsigned Size) {
227   return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229 
230 static bool isRegisterVectorElementType(LLT EltTy) {
231   const int EltSize = EltTy.getSizeInBits();
232   return EltSize == 16 || EltSize % 32 == 0;
233 }
234 
235 static bool isRegisterVectorType(LLT Ty) {
236   const int EltSize = Ty.getElementType().getSizeInBits();
237   return EltSize == 32 || EltSize == 64 ||
238          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239          EltSize == 128 || EltSize == 256;
240 }
241 
242 static bool isRegisterType(LLT Ty) {
243   if (!isRegisterSize(Ty.getSizeInBits()))
244     return false;
245 
246   if (Ty.isVector())
247     return isRegisterVectorType(Ty);
248 
249   return true;
250 }
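// Examples: s32, s96, <2 x s16>, <4 x s16>, <2 x s64> and <16 x s64> (1024
// bits) are register types; s48 and <3 x s16> fail the size check, and
// <4 x s8> fails the element type check.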
251 
252 // Any combination of 32 or 64-bit elements up to the maximum register size,
253 // and multiples of v2s16.
254 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
255   return [=](const LegalityQuery &Query) {
256     return isRegisterType(Query.Types[TypeIdx]);
257   };
258 }
259 
260 // RegisterType that doesn't have a corresponding RegClass.
261 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
262   return [=](const LegalityQuery &Query) {
263     LLT Ty = Query.Types[TypeIdx];
264     return isRegisterType(Ty) &&
265            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
266   };
267 }
268 
269 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
270   return [=](const LegalityQuery &Query) {
271     const LLT QueryTy = Query.Types[TypeIdx];
272     if (!QueryTy.isVector())
273       return false;
274     const LLT EltTy = QueryTy.getElementType();
275     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
276   };
277 }
278 
279 // If we have a truncating store or an extending load with a data size larger
280 // than 32-bits, we need to reduce to a 32-bit type.
281 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
282   return [=](const LegalityQuery &Query) {
283     const LLT Ty = Query.Types[TypeIdx];
284     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
286   };
287 }
288 
289 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
290 // handle some operations by just promoting the register during
291 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
292 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
293                                     bool IsLoad, bool IsAtomic) {
294   switch (AS) {
295   case AMDGPUAS::PRIVATE_ADDRESS:
296     // FIXME: Private element size.
297     return ST.enableFlatScratch() ? 128 : 32;
298   case AMDGPUAS::LOCAL_ADDRESS:
299     return ST.useDS128() ? 128 : 64;
300   case AMDGPUAS::GLOBAL_ADDRESS:
301   case AMDGPUAS::CONSTANT_ADDRESS:
302   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
303   case AMDGPUAS::BUFFER_RESOURCE:
304     // Treat constant and global as identical. SMRD loads are sometimes usable for
305     // global loads (ideally constant address space should be eliminated)
306     // depending on the context. Legality cannot be context dependent, but
307     // RegBankSelect can split the load as necessary depending on the pointer
308     // register bank/uniformity and if the memory is invariant or not written in a
309     // kernel.
310     return IsLoad ? 512 : 128;
311   default:
312     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
313     // if they may alias scratch depending on the subtarget.  This needs to be
314     // moved to custom handling to use addressMayBeAccessedAsPrivate
315     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
316   }
317 }
318 
319 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
320                                  const LegalityQuery &Query) {
321   const LLT Ty = Query.Types[0];
322 
323   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
324   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
325 
326   unsigned RegSize = Ty.getSizeInBits();
327   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
328   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
329   unsigned AS = Query.Types[1].getAddressSpace();
330 
331   // All of these need to be custom lowered to cast the pointer operand.
332   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
333     return false;
334 
335   // Do not handle extending vector loads.
336   if (Ty.isVector() && MemSize != RegSize)
337     return false;
338 
339   // TODO: We should be able to widen loads if the alignment is high enough, but
340   // we also need to modify the memory access size.
341 #if 0
342   // Accept widening loads based on alignment.
343   if (IsLoad && MemSize < Size)
344     MemSize = std::max(MemSize, Align);
345 #endif
346 
347   // Only 1-byte and 2-byte to 32-bit extloads are valid.
348   if (MemSize != RegSize && RegSize != 32)
349     return false;
350 
351   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
352                                     Query.MMODescrs[0].Ordering !=
353                                         AtomicOrdering::NotAtomic))
354     return false;
355 
356   switch (MemSize) {
357   case 8:
358   case 16:
359   case 32:
360   case 64:
361   case 128:
362     break;
363   case 96:
364     if (!ST.hasDwordx3LoadStores())
365       return false;
366     break;
367   case 256:
368   case 512:
369     // These may contextually need to be broken down.
370     break;
371   default:
372     return false;
373   }
374 
375   assert(RegSize >= MemSize);
376 
377   if (AlignBits < MemSize) {
378     const SITargetLowering *TLI = ST.getTargetLowering();
379     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
380                                                  Align(AlignBits / 8)))
381       return false;
382   }
383 
384   return true;
385 }
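// For illustration: a global extending load of s8 into an s32 register passes
// the checks above (extloads are only accepted into 32-bit registers), while
// an s64 register fed by an s16 memory access is rejected by the
// "MemSize != RegSize && RegSize != 32" check.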
386 
387 // The newer buffer intrinsic forms take their resource arguments as
388 // pointers in address space 8, aka s128 values. However, in order to not break
389 // SelectionDAG, the underlying operations have to continue to take v4i32
390 // arguments. Therefore, we convert resource pointers - or vectors of them -
391 // to integer values here.
392 static bool hasBufferRsrcWorkaround(const LLT Ty) {
393   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
394     return true;
395   if (Ty.isVector()) {
396     const LLT ElemTy = Ty.getElementType();
397     return hasBufferRsrcWorkaround(ElemTy);
398   }
399   return false;
400 }
401 
402 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
403 // work around this. Eventually it should ignore the type for loads and only
404 // care about the size. Return true in cases where we will work around this for
405 // now by bitcasting.
406 static bool loadStoreBitcastWorkaround(const LLT Ty) {
407   if (EnableNewLegality)
408     return false;
409 
410   const unsigned Size = Ty.getSizeInBits();
411   if (Size <= 64)
412     return false;
413   // Address space 8 pointers get their own workaround.
414   if (hasBufferRsrcWorkaround(Ty))
415     return false;
416   if (!Ty.isVector())
417     return true;
418 
419   LLT EltTy = Ty.getElementType();
420   if (EltTy.isPointer())
421     return true;
422 
423   unsigned EltSize = EltTy.getSizeInBits();
424   return EltSize != 32 && EltSize != 64;
425 }
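// For illustration, with the default rules (EnableNewLegality off): <6 x s16>
// and s96 report true here (they will be bitcast), while <2 x s64>, <4 x s32>
// and anything of 64 bits or less report false.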
426 
427 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
428   const LLT Ty = Query.Types[0];
429   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
430          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
431 }
432 
433 /// Return true if a load or store of the type should be lowered with a bitcast
434 /// to a different type.
435 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
436                                        const LLT MemTy) {
437   const unsigned MemSizeInBits = MemTy.getSizeInBits();
438   const unsigned Size = Ty.getSizeInBits();
439   if (Size != MemSizeInBits)
440     return Size <= 32 && Ty.isVector();
441 
442   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
443     return true;
444 
445   // Don't try to handle bitcasting vector ext loads for now.
446   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
447          (Size <= 32 || isRegisterSize(Size)) &&
448          !isRegisterVectorElementType(Ty.getElementType());
449 }
450 
451 /// Return true if we should legalize a load by widening an odd-sized memory
452 /// access up to the alignment. Note this is the case where the memory access
453 /// itself changes, not the size of the result register.
454 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
455                             uint64_t AlignInBits, unsigned AddrSpace,
456                             unsigned Opcode) {
457   unsigned SizeInBits = MemoryTy.getSizeInBits();
458   // We don't want to widen cases that are naturally legal.
459   if (isPowerOf2_32(SizeInBits))
460     return false;
461 
462   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
463   // end up widening these for a scalar load during RegBankSelect, if we don't
464   // have 96-bit scalar loads.
465   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466     return false;
467 
468   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
469     return false;
470 
471   // A load is known dereferenceable up to the alignment, so it's legal to widen
472   // to it.
473   //
474   // TODO: Could check dereferenceable for less aligned cases.
475   unsigned RoundedSize = NextPowerOf2(SizeInBits);
476   if (AlignInBits < RoundedSize)
477     return false;
478 
479   // Do not widen if it would introduce a slow unaligned load.
480   const SITargetLowering *TLI = ST.getTargetLowering();
481   unsigned Fast = 0;
482   return TLI->allowsMisalignedMemoryAccessesImpl(
483              RoundedSize, AddrSpace, Align(AlignInBits / 8),
484              MachineMemOperand::MOLoad, &Fast) &&
485          Fast;
486 }
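// For illustration: a non-atomic 48-bit load known to be 64-bit aligned may be
// widened to a 64-bit access (provided the target reports the access as fast),
// whereas a 96-bit load is left alone when dwordx3 load/stores are available.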
487 
488 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489                             unsigned Opcode) {
490   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491     return false;
492 
493   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494                          Query.MMODescrs[0].AlignInBits,
495                          Query.Types[1].getAddressSpace(), Opcode);
496 }
497 
498 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
499 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
500 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
501 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
502                                    MachineRegisterInfo &MRI, unsigned Idx) {
503   MachineOperand &MO = MI.getOperand(Idx);
504 
505   const LLT PointerTy = MRI.getType(MO.getReg());
506 
507   // Paranoidly prevent us from doing this multiple times.
508   if (!hasBufferRsrcWorkaround(PointerTy))
509     return PointerTy;
510 
511   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
512   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
513   if (!PointerTy.isVector()) {
514     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
515     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
516     const LLT S32 = LLT::scalar(32);
517 
518     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
519     std::array<Register, 4> VectorElems;
520     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
521     for (unsigned I = 0; I < NumParts; ++I)
522       VectorElems[I] =
523           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
524     B.buildMergeValues(MO, VectorElems);
525     MO.setReg(VectorReg);
526     return VectorTy;
527   }
528   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
529   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
530   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
531   B.buildIntToPtr(MO, Scalar);
532   MO.setReg(BitcastReg);
533 
534   return VectorTy;
535 }
536 
537 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
538 /// the form in which the value must be in order to be passed to the low-level
539 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
540 /// needed in order to account for the fact that we can't define a register
541 /// class for s128 without breaking SelectionDAG.
542 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
543   MachineRegisterInfo &MRI = *B.getMRI();
544   const LLT PointerTy = MRI.getType(Pointer);
545   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
546   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
547 
548   if (!PointerTy.isVector()) {
549     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
550     SmallVector<Register, 4> PointerParts;
551     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
552     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
553     for (unsigned I = 0; I < NumParts; ++I)
554       PointerParts.push_back(Unmerged.getReg(I));
555     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
556   }
557   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
558   return B.buildBitcast(VectorTy, Scalar).getReg(0);
559 }
560 
561 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
562                                      unsigned Idx) {
563   MachineOperand &MO = MI.getOperand(Idx);
564 
565   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
566   // Paranoidly prevent us from doing this multiple times.
567   if (!hasBufferRsrcWorkaround(PointerTy))
568     return;
569   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
570 }
571 
572 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
573                                          const GCNTargetMachine &TM)
574   :  ST(ST_) {
575   using namespace TargetOpcode;
576 
577   auto GetAddrSpacePtr = [&TM](unsigned AS) {
578     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
579   };
580 
581   const LLT S1 = LLT::scalar(1);
582   const LLT S8 = LLT::scalar(8);
583   const LLT S16 = LLT::scalar(16);
584   const LLT S32 = LLT::scalar(32);
585   const LLT S64 = LLT::scalar(64);
586   const LLT S128 = LLT::scalar(128);
587   const LLT S256 = LLT::scalar(256);
588   const LLT S512 = LLT::scalar(512);
589   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
590 
591   const LLT V2S8 = LLT::fixed_vector(2, 8);
592   const LLT V2S16 = LLT::fixed_vector(2, 16);
593   const LLT V4S16 = LLT::fixed_vector(4, 16);
594 
595   const LLT V2S32 = LLT::fixed_vector(2, 32);
596   const LLT V3S32 = LLT::fixed_vector(3, 32);
597   const LLT V4S32 = LLT::fixed_vector(4, 32);
598   const LLT V5S32 = LLT::fixed_vector(5, 32);
599   const LLT V6S32 = LLT::fixed_vector(6, 32);
600   const LLT V7S32 = LLT::fixed_vector(7, 32);
601   const LLT V8S32 = LLT::fixed_vector(8, 32);
602   const LLT V9S32 = LLT::fixed_vector(9, 32);
603   const LLT V10S32 = LLT::fixed_vector(10, 32);
604   const LLT V11S32 = LLT::fixed_vector(11, 32);
605   const LLT V12S32 = LLT::fixed_vector(12, 32);
606   const LLT V13S32 = LLT::fixed_vector(13, 32);
607   const LLT V14S32 = LLT::fixed_vector(14, 32);
608   const LLT V15S32 = LLT::fixed_vector(15, 32);
609   const LLT V16S32 = LLT::fixed_vector(16, 32);
610   const LLT V32S32 = LLT::fixed_vector(32, 32);
611 
612   const LLT V2S64 = LLT::fixed_vector(2, 64);
613   const LLT V3S64 = LLT::fixed_vector(3, 64);
614   const LLT V4S64 = LLT::fixed_vector(4, 64);
615   const LLT V5S64 = LLT::fixed_vector(5, 64);
616   const LLT V6S64 = LLT::fixed_vector(6, 64);
617   const LLT V7S64 = LLT::fixed_vector(7, 64);
618   const LLT V8S64 = LLT::fixed_vector(8, 64);
619   const LLT V16S64 = LLT::fixed_vector(16, 64);
620 
621   std::initializer_list<LLT> AllS32Vectors =
622     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
623      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
624   std::initializer_list<LLT> AllS64Vectors =
625     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
626 
627   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
628   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
629   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
630   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
631   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
632   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
633   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636   const LLT BufferStridedPtr =
637       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
638 
639   const LLT CodePtr = FlatPtr;
640 
641   const std::initializer_list<LLT> AddrSpaces64 = {
642     GlobalPtr, ConstantPtr, FlatPtr
643   };
644 
645   const std::initializer_list<LLT> AddrSpaces32 = {
646     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
647   };
648 
649   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
650 
651   const std::initializer_list<LLT> FPTypesBase = {
652     S32, S64
653   };
654 
655   const std::initializer_list<LLT> FPTypes16 = {
656     S32, S64, S16
657   };
658 
659   const std::initializer_list<LLT> FPTypesPK16 = {
660     S32, S64, S16, V2S16
661   };
662 
663   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
664 
665   // s1 for VCC branches, s32 for SCC branches.
666   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
667 
668   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
669   // elements for v3s16
670   getActionDefinitionsBuilder(G_PHI)
671       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
672       .legalFor(AllS32Vectors)
673       .legalFor(AllS64Vectors)
674       .legalFor(AddrSpaces64)
675       .legalFor(AddrSpaces32)
676       .legalFor(AddrSpaces128)
677       .legalIf(isPointer(0))
678       .clampScalar(0, S16, S256)
679       .widenScalarToNextPow2(0, 32)
680       .clampMaxNumElements(0, S32, 16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .scalarize(0);
683 
684   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
685     // Full set of gfx9 features.
686     if (ST.hasScalarAddSub64()) {
687       getActionDefinitionsBuilder({G_ADD, G_SUB})
688           .legalFor({S64, S32, S16, V2S16})
689           .clampMaxNumElementsStrict(0, S16, 2)
690           .scalarize(0)
691           .minScalar(0, S16)
692           .widenScalarToNextMultipleOf(0, 32)
693           .maxScalar(0, S32);
694     } else {
695       getActionDefinitionsBuilder({G_ADD, G_SUB})
696           .legalFor({S32, S16, V2S16})
697           .clampMaxNumElementsStrict(0, S16, 2)
698           .scalarize(0)
699           .minScalar(0, S16)
700           .widenScalarToNextMultipleOf(0, 32)
701           .maxScalar(0, S32);
702     }
703 
704     if (ST.hasScalarSMulU64()) {
705       getActionDefinitionsBuilder(G_MUL)
706           .legalFor({S64, S32, S16, V2S16})
707           .clampMaxNumElementsStrict(0, S16, 2)
708           .scalarize(0)
709           .minScalar(0, S16)
710           .widenScalarToNextMultipleOf(0, 32)
711           .custom();
712     } else {
713       getActionDefinitionsBuilder(G_MUL)
714           .legalFor({S32, S16, V2S16})
715           .clampMaxNumElementsStrict(0, S16, 2)
716           .scalarize(0)
717           .minScalar(0, S16)
718           .widenScalarToNextMultipleOf(0, 32)
719           .custom();
720     }
721     assert(ST.hasMad64_32());
722 
723     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
724       .legalFor({S32, S16, V2S16}) // Clamp modifier
725       .minScalarOrElt(0, S16)
726       .clampMaxNumElementsStrict(0, S16, 2)
727       .scalarize(0)
728       .widenScalarToNextPow2(0, 32)
729       .lower();
730   } else if (ST.has16BitInsts()) {
731     getActionDefinitionsBuilder({G_ADD, G_SUB})
732       .legalFor({S32, S16})
733       .minScalar(0, S16)
734       .widenScalarToNextMultipleOf(0, 32)
735       .maxScalar(0, S32)
736       .scalarize(0);
737 
738     getActionDefinitionsBuilder(G_MUL)
739       .legalFor({S32, S16})
740       .scalarize(0)
741       .minScalar(0, S16)
742       .widenScalarToNextMultipleOf(0, 32)
743       .custom();
744     assert(ST.hasMad64_32());
745 
746     // Technically the saturating operations require clamp bit support, but this
747     // was introduced at the same time as 16-bit operations.
748     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
749       .legalFor({S32, S16}) // Clamp modifier
750       .minScalar(0, S16)
751       .scalarize(0)
752       .widenScalarToNextPow2(0, 16)
753       .lower();
754 
755     // We're just lowering this, but it helps get a better result to try to
756     // coerce to the desired type first.
757     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
758       .minScalar(0, S16)
759       .scalarize(0)
760       .lower();
761   } else {
762     getActionDefinitionsBuilder({G_ADD, G_SUB})
763       .legalFor({S32})
764       .widenScalarToNextMultipleOf(0, 32)
765       .clampScalar(0, S32, S32)
766       .scalarize(0);
767 
768     auto &Mul = getActionDefinitionsBuilder(G_MUL)
769       .legalFor({S32})
770       .scalarize(0)
771       .minScalar(0, S32)
772       .widenScalarToNextMultipleOf(0, 32);
773 
774     if (ST.hasMad64_32())
775       Mul.custom();
776     else
777       Mul.maxScalar(0, S32);
778 
779     if (ST.hasIntClamp()) {
780       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781         .legalFor({S32}) // Clamp modifier.
782         .scalarize(0)
783         .minScalarOrElt(0, S32)
784         .lower();
785     } else {
786       // Clamp bit support was added in VI, along with 16-bit operations.
787       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
788         .minScalar(0, S32)
789         .scalarize(0)
790         .lower();
791     }
792 
793     // FIXME: DAG expansion gets better results. The widening uses the smaller
794     // range values and goes for the min/max lowering directly.
795     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796       .minScalar(0, S32)
797       .scalarize(0)
798       .lower();
799   }
800 
801   getActionDefinitionsBuilder(
802       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
803       .customFor({S32, S64})
804       .clampScalar(0, S32, S64)
805       .widenScalarToNextPow2(0, 32)
806       .scalarize(0);
807 
808   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
809                    .legalFor({S32})
810                    .maxScalar(0, S32);
811 
812   if (ST.hasVOP3PInsts()) {
813     Mulh
814       .clampMaxNumElements(0, S8, 2)
815       .lowerFor({V2S8});
816   }
817 
818   Mulh
819     .scalarize(0)
820     .lower();
821 
822   // Report legal for any types we can handle anywhere. For the cases only legal
823   // on the SALU, RegBankSelect will be able to re-legalize.
824   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
825     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
826     .clampScalar(0, S32, S64)
827     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
828     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
829     .widenScalarToNextPow2(0)
830     .scalarize(0);
831 
832   getActionDefinitionsBuilder(
833       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
834       .legalFor({{S32, S1}, {S32, S32}})
835       .clampScalar(0, S32, S32)
836       .scalarize(0);
837 
838   getActionDefinitionsBuilder(G_BITCAST)
839     // Don't worry about the size constraint.
840     .legalIf(all(isRegisterType(0), isRegisterType(1)))
841     .lower();
842 
843 
844   getActionDefinitionsBuilder(G_CONSTANT)
845     .legalFor({S1, S32, S64, S16, GlobalPtr,
846                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
847     .legalIf(isPointer(0))
848     .clampScalar(0, S32, S64)
849     .widenScalarToNextPow2(0);
850 
851   getActionDefinitionsBuilder(G_FCONSTANT)
852     .legalFor({S32, S64, S16})
853     .clampScalar(0, S16, S64);
854 
855   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
856       .legalIf(isRegisterType(0))
857       // s1 and s16 are special cases because they have legal operations on
858       // them, but don't really occupy registers in the normal way.
859       .legalFor({S1, S16})
860       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
861       .clampScalarOrElt(0, S32, MaxScalar)
862       .widenScalarToNextPow2(0, 32)
863       .clampMaxNumElements(0, S32, 16);
864 
865   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
866 
867   // If the amount is divergent, we have to do a wave reduction to get the
868   // maximum value, so this is expanded during RegBankSelect.
869   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
870     .legalFor({{PrivatePtr, S32}});
871 
872   getActionDefinitionsBuilder(G_STACKSAVE)
873     .customFor({PrivatePtr});
874   getActionDefinitionsBuilder(G_STACKRESTORE)
875     .legalFor({PrivatePtr});
876 
877   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
878     .customIf(typeIsNot(0, PrivatePtr));
879 
880   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
881 
882   auto &FPOpActions = getActionDefinitionsBuilder(
883     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
884       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
885     .legalFor({S32, S64});
886   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
887     .customFor({S32, S64});
888   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
889     .customFor({S32, S64});
890 
891   if (ST.has16BitInsts()) {
892     if (ST.hasVOP3PInsts())
893       FPOpActions.legalFor({S16, V2S16});
894     else
895       FPOpActions.legalFor({S16});
896 
897     TrigActions.customFor({S16});
898     FDIVActions.customFor({S16});
899   }
900 
901   if (ST.hasPackedFP32Ops()) {
902     FPOpActions.legalFor({V2S32});
903     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
904   }
905 
906   auto &MinNumMaxNum = getActionDefinitionsBuilder({
907       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
908 
909   if (ST.hasVOP3PInsts()) {
910     MinNumMaxNum.customFor(FPTypesPK16)
911       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912       .clampMaxNumElements(0, S16, 2)
913       .clampScalar(0, S16, S64)
914       .scalarize(0);
915   } else if (ST.has16BitInsts()) {
916     MinNumMaxNum.customFor(FPTypes16)
917       .clampScalar(0, S16, S64)
918       .scalarize(0);
919   } else {
920     MinNumMaxNum.customFor(FPTypesBase)
921       .clampScalar(0, S32, S64)
922       .scalarize(0);
923   }
924 
925   if (ST.hasVOP3PInsts())
926     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
927 
928   FPOpActions
929     .scalarize(0)
930     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
931 
932   TrigActions
933     .scalarize(0)
934     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
935 
936   FDIVActions
937     .scalarize(0)
938     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
939 
940   getActionDefinitionsBuilder({G_FNEG, G_FABS})
941     .legalFor(FPTypesPK16)
942     .clampMaxNumElementsStrict(0, S16, 2)
943     .scalarize(0)
944     .clampScalar(0, S16, S64);
945 
946   if (ST.has16BitInsts()) {
947     getActionDefinitionsBuilder(G_FSQRT)
948       .legalFor({S16})
949       .customFor({S32, S64})
950       .scalarize(0)
951       .unsupported();
952     getActionDefinitionsBuilder(G_FFLOOR)
953       .legalFor({S32, S64, S16})
954       .scalarize(0)
955       .clampScalar(0, S16, S64);
956 
957     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
958       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
959       .scalarize(0)
960       .maxScalarIf(typeIs(0, S16), 1, S16)
961       .clampScalar(1, S32, S32)
962       .lower();
963 
964     getActionDefinitionsBuilder(G_FFREXP)
965       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
966       .scalarize(0)
967       .lower();
968   } else {
969     getActionDefinitionsBuilder(G_FSQRT)
970       .customFor({S32, S64, S16})
971       .scalarize(0)
972       .unsupported();
973 
974 
975     if (ST.hasFractBug()) {
976       getActionDefinitionsBuilder(G_FFLOOR)
977         .customFor({S64})
978         .legalFor({S32, S64})
979         .scalarize(0)
980         .clampScalar(0, S32, S64);
981     } else {
982       getActionDefinitionsBuilder(G_FFLOOR)
983         .legalFor({S32, S64})
984         .scalarize(0)
985         .clampScalar(0, S32, S64);
986     }
987 
988     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
989       .legalFor({{S32, S32}, {S64, S32}})
990       .scalarize(0)
991       .clampScalar(0, S32, S64)
992       .clampScalar(1, S32, S32)
993       .lower();
994 
995     getActionDefinitionsBuilder(G_FFREXP)
996       .customFor({{S32, S32}, {S64, S32}})
997       .scalarize(0)
998       .minScalar(0, S32)
999       .clampScalar(1, S32, S32)
1000       .lower();
1001   }
1002 
1003   getActionDefinitionsBuilder(G_FPTRUNC)
1004     .legalFor({{S32, S64}, {S16, S32}})
1005     .scalarize(0)
1006     .lower();
1007 
1008   getActionDefinitionsBuilder(G_FPEXT)
1009     .legalFor({{S64, S32}, {S32, S16}})
1010     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1011     .scalarize(0);
1012 
1013   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1014   if (ST.has16BitInsts()) {
1015     FSubActions
1016       // Use actual fsub instruction
1017       .legalFor({S32, S16})
1018       // Must use fadd + fneg
1019       .lowerFor({S64, V2S16});
1020   } else {
1021     FSubActions
1022       // Use actual fsub instruction
1023       .legalFor({S32})
1024       // Must use fadd + fneg
1025       .lowerFor({S64, S16, V2S16});
1026   }
1027 
1028   FSubActions
1029     .scalarize(0)
1030     .clampScalar(0, S32, S64);
1031 
1032   // Whether this is legal depends on the floating point mode for the function.
1033   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1034   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1035     FMad.customFor({S32, S16});
1036   else if (ST.hasMadMacF32Insts())
1037     FMad.customFor({S32});
1038   else if (ST.hasMadF16())
1039     FMad.customFor({S16});
1040   FMad.scalarize(0)
1041       .lower();
1042 
1043   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1044   if (ST.has16BitInsts()) {
1045     FRem.customFor({S16, S32, S64});
1046   } else {
1047     FRem.minScalar(0, S32)
1048         .customFor({S32, S64});
1049   }
1050   FRem.scalarize(0);
1051 
1052   // TODO: Do we need to clamp maximum bitwidth?
1053   getActionDefinitionsBuilder(G_TRUNC)
1054     .legalIf(isScalar(0))
1055     .legalFor({{V2S16, V2S32}})
1056     .clampMaxNumElements(0, S16, 2)
1057     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1058     // situations (like an invalid implicit use), we don't want to loop forever
1059     // in the legalizer.
1060     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1061     .alwaysLegal();
1062 
1063   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1064     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1065                {S32, S1}, {S64, S1}, {S16, S1}})
1066     .scalarize(0)
1067     .clampScalar(0, S32, S64)
1068     .widenScalarToNextPow2(1, 32);
1069 
1070   // TODO: Split s1->s64 during regbankselect for VALU.
1071   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1072                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1073                     .lowerIf(typeIs(1, S1))
1074                     .customFor({{S32, S64}, {S64, S64}});
1075   if (ST.has16BitInsts())
1076     IToFP.legalFor({{S16, S16}});
1077   IToFP.clampScalar(1, S32, S64)
1078        .minScalar(0, S32)
1079        .scalarize(0)
1080        .widenScalarToNextPow2(1);
1081 
1082   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1083     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1084     .customFor({{S64, S32}, {S64, S64}})
1085     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1086   if (ST.has16BitInsts())
1087     FPToI.legalFor({{S16, S16}});
1088   else
1089     FPToI.minScalar(1, S32);
1090 
1091   FPToI.minScalar(0, S32)
1092        .widenScalarToNextPow2(0, 32)
1093        .scalarize(0)
1094        .lower();
1095 
1096   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1097       .customFor({S16, S32})
1098       .scalarize(0)
1099       .lower();
1100 
1101   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1102   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1103       .scalarize(0)
1104       .lower();
1105 
1106   if (ST.has16BitInsts()) {
1107     getActionDefinitionsBuilder(
1108         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1109         .legalFor({S16, S32, S64})
1110         .clampScalar(0, S16, S64)
1111         .scalarize(0);
1112   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1113     getActionDefinitionsBuilder(
1114         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1115         .legalFor({S32, S64})
1116         .clampScalar(0, S32, S64)
1117         .scalarize(0);
1118   } else {
1119     getActionDefinitionsBuilder(
1120         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1121         .legalFor({S32})
1122         .customFor({S64})
1123         .clampScalar(0, S32, S64)
1124         .scalarize(0);
1125   }
1126 
1127   getActionDefinitionsBuilder(G_PTR_ADD)
1128       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1129       .legalIf(all(isPointer(0), sameSize(0, 1)))
1130       .scalarize(0)
1131       .scalarSameSizeAs(1, 0);
1132 
1133   getActionDefinitionsBuilder(G_PTRMASK)
1134     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1135     .scalarSameSizeAs(1, 0)
1136     .scalarize(0);
1137 
1138   auto &CmpBuilder =
1139     getActionDefinitionsBuilder(G_ICMP)
1140     // The compare output type differs based on the register bank of the output,
1141     // so make both s1 and s32 legal.
1142     //
1143     // Scalar compares producing output in scc will be promoted to s32, as that
1144     // is the allocatable register type that will be needed for the copy from
1145     // scc. This will be promoted during RegBankSelect, and we assume something
1146     // before that won't try to use s32 result types.
1147     //
1148     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1149     // bank.
1150     .legalForCartesianProduct(
1151       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1152     .legalForCartesianProduct(
1153       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1154   if (ST.has16BitInsts()) {
1155     CmpBuilder.legalFor({{S1, S16}});
1156   }
1157 
1158   CmpBuilder
1159     .widenScalarToNextPow2(1)
1160     .clampScalar(1, S32, S64)
1161     .scalarize(0)
1162     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1163 
1164   auto &FCmpBuilder =
1165       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1166           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1167 
1168   if (ST.hasSALUFloatInsts())
1169     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1170 
1171   FCmpBuilder
1172     .widenScalarToNextPow2(1)
1173     .clampScalar(1, S32, S64)
1174     .scalarize(0);
1175 
1176   // FIXME: fpow has a selection pattern that should move to custom lowering.
1177   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1178   if (ST.has16BitInsts())
1179     ExpOps.customFor({{S32}, {S16}});
1180   else
1181     ExpOps.customFor({S32});
1182   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1183         .scalarize(0);
1184 
1185   getActionDefinitionsBuilder(G_FPOWI)
1186     .clampScalar(0, MinScalarFPTy, S32)
1187     .lower();
1188 
1189   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1190   Log2Ops.customFor({S32});
1191   if (ST.has16BitInsts())
1192     Log2Ops.legalFor({S16});
1193   else
1194     Log2Ops.customFor({S16});
1195   Log2Ops.scalarize(0)
1196     .lower();
1197 
1198   auto &LogOps =
1199       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1200   LogOps.customFor({S32, S16});
1201   LogOps.clampScalar(0, MinScalarFPTy, S32)
1202         .scalarize(0);
1203 
1204   // The 64-bit versions produce 32-bit results, but only on the SALU.
1205   getActionDefinitionsBuilder(G_CTPOP)
1206     .legalFor({{S32, S32}, {S32, S64}})
1207     .clampScalar(0, S32, S32)
1208     .widenScalarToNextPow2(1, 32)
1209     .clampScalar(1, S32, S64)
1210     .scalarize(0)
1211     .widenScalarToNextPow2(0, 32);
1212 
1213   // If no 16-bit instruction is available, lower into different instructions.
1214   if (ST.has16BitInsts())
1215     getActionDefinitionsBuilder(G_IS_FPCLASS)
1216         .legalForCartesianProduct({S1}, FPTypes16)
1217         .widenScalarToNextPow2(1)
1218         .scalarize(0)
1219         .lower();
1220   else
1221     getActionDefinitionsBuilder(G_IS_FPCLASS)
1222         .legalForCartesianProduct({S1}, FPTypesBase)
1223         .lowerFor({S1, S16})
1224         .widenScalarToNextPow2(1)
1225         .scalarize(0)
1226         .lower();
1227 
1228   // The hardware instructions return a different result on 0 than the generic
1229   // instructions expect. The hardware produces -1, but these produce the
1230   // bitwidth.
1231   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1232     .scalarize(0)
1233     .clampScalar(0, S32, S32)
1234     .clampScalar(1, S32, S64)
1235     .widenScalarToNextPow2(0, 32)
1236     .widenScalarToNextPow2(1, 32)
1237     .custom();
1238 
1239   // The 64-bit versions produce 32-bit results, but only on the SALU.
1240   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1241     .legalFor({{S32, S32}, {S32, S64}})
1242     .clampScalar(0, S32, S32)
1243     .clampScalar(1, S32, S64)
1244     .scalarize(0)
1245     .widenScalarToNextPow2(0, 32)
1246     .widenScalarToNextPow2(1, 32);
1247 
1248   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1249   // RegBankSelect.
1250   getActionDefinitionsBuilder(G_BITREVERSE)
1251     .legalFor({S32, S64})
1252     .clampScalar(0, S32, S64)
1253     .scalarize(0)
1254     .widenScalarToNextPow2(0);
1255 
1256   if (ST.has16BitInsts()) {
1257     getActionDefinitionsBuilder(G_BSWAP)
1258       .legalFor({S16, S32, V2S16})
1259       .clampMaxNumElementsStrict(0, S16, 2)
1260       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1261       // narrowScalar limitation.
1262       .widenScalarToNextPow2(0)
1263       .clampScalar(0, S16, S32)
1264       .scalarize(0);
1265 
1266     if (ST.hasVOP3PInsts()) {
1267       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1268         .legalFor({S32, S16, V2S16})
1269         .clampMaxNumElements(0, S16, 2)
1270         .minScalar(0, S16)
1271         .widenScalarToNextPow2(0)
1272         .scalarize(0)
1273         .lower();
1274     } else {
1275       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1276         .legalFor({S32, S16})
1277         .widenScalarToNextPow2(0)
1278         .minScalar(0, S16)
1279         .scalarize(0)
1280         .lower();
1281     }
1282   } else {
1283     // TODO: Should have same legality without v_perm_b32
1284     getActionDefinitionsBuilder(G_BSWAP)
1285       .legalFor({S32})
1286       .lowerIf(scalarNarrowerThan(0, 32))
1287       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1288       // narrowScalar limitation.
1289       .widenScalarToNextPow2(0)
1290       .maxScalar(0, S32)
1291       .scalarize(0)
1292       .lower();
1293 
1294     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1295       .legalFor({S32})
1296       .minScalar(0, S32)
1297       .widenScalarToNextPow2(0)
1298       .scalarize(0)
1299       .lower();
1300   }
1301 
1302   getActionDefinitionsBuilder(G_INTTOPTR)
1303       // List the common cases
1304       .legalForCartesianProduct(AddrSpaces64, {S64})
1305       .legalForCartesianProduct(AddrSpaces32, {S32})
1306       .scalarize(0)
1307       // Accept any address space as long as the size matches
1308       .legalIf(sameSize(0, 1))
1309       .widenScalarIf(smallerThan(1, 0),
1310                      [](const LegalityQuery &Query) {
1311                        return std::pair(
1312                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1313                      })
1314       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1315         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1316       });
1317 
1318   getActionDefinitionsBuilder(G_PTRTOINT)
1319       // List the common cases
1320       .legalForCartesianProduct(AddrSpaces64, {S64})
1321       .legalForCartesianProduct(AddrSpaces32, {S32})
1322       .scalarize(0)
1323       // Accept any address space as long as the size matches
1324       .legalIf(sameSize(0, 1))
1325       .widenScalarIf(smallerThan(0, 1),
1326                      [](const LegalityQuery &Query) {
1327                        return std::pair(
1328                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1329                      })
1330       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1331         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1332       });
1333 
1334   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1335     .scalarize(0)
1336     .custom();
1337 
1338   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1339                                     bool IsLoad) -> bool {
1340     const LLT DstTy = Query.Types[0];
1341 
1342     // Split vector extloads.
1343     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1344 
1345     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1346       return true;
1347 
1348     const LLT PtrTy = Query.Types[1];
1349     unsigned AS = PtrTy.getAddressSpace();
1350     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1351                                       Query.MMODescrs[0].Ordering !=
1352                                           AtomicOrdering::NotAtomic))
1353       return true;
1354 
1355     // Catch weird sized loads that don't evenly divide into the access sizes
1356     // TODO: May be able to widen depending on alignment etc.
1357     unsigned NumRegs = (MemSize + 31) / 32;
1358     if (NumRegs == 3) {
1359       if (!ST.hasDwordx3LoadStores())
1360         return true;
1361     } else {
1362       // If the alignment allows, these should have been widened.
1363       if (!isPowerOf2_32(NumRegs))
1364         return true;
1365     }
1366 
1367     return false;
1368   };
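  // For illustration: a 96-bit access is split here when the subtarget lacks
  // dwordx3 load/stores, and any access wider than maxSizeForAddrSpace() for
  // its address space (e.g. a store wider than 128 bits to global memory) is
  // split.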
1369 
1370   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1371   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1372   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1373 
1374   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1375   // LDS
1376   // TODO: Unsupported flat for SI.
1377 
1378   for (unsigned Op : {G_LOAD, G_STORE}) {
1379     const bool IsStore = Op == G_STORE;
1380 
1381     auto &Actions = getActionDefinitionsBuilder(Op);
1382     // Explicitly list some common cases.
1383     // TODO: Does this help compile time at all?
1384     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1385                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1386                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1387                                       {S64, GlobalPtr, S64, GlobalAlign32},
1388                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1389                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1390                                       {S32, GlobalPtr, S8, GlobalAlign8},
1391                                       {S32, GlobalPtr, S16, GlobalAlign16},
1392 
1393                                       {S32, LocalPtr, S32, 32},
1394                                       {S64, LocalPtr, S64, 32},
1395                                       {V2S32, LocalPtr, V2S32, 32},
1396                                       {S32, LocalPtr, S8, 8},
1397                                       {S32, LocalPtr, S16, 16},
1398                                       {V2S16, LocalPtr, S32, 32},
1399 
1400                                       {S32, PrivatePtr, S32, 32},
1401                                       {S32, PrivatePtr, S8, 8},
1402                                       {S32, PrivatePtr, S16, 16},
1403                                       {V2S16, PrivatePtr, S32, 32},
1404 
1405                                       {S32, ConstantPtr, S32, GlobalAlign32},
1406                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1407                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1408                                       {S64, ConstantPtr, S64, GlobalAlign32},
1409                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1410     Actions.legalIf(
1411       [=](const LegalityQuery &Query) -> bool {
1412         return isLoadStoreLegal(ST, Query);
1413       });
1414 
1415     // The custom pointers (fat pointers, buffer resources) don't work with load
1416     // and store at this level. Fat pointers should have been lowered to
1417     // intrinsics before the translation to MIR.
1418     Actions.unsupportedIf(
1419         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1420 
1421     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1422     // ptrtoint. This is needed to account for the fact that we can't have i128
1423     // as a register class for SelectionDAG reasons.
1424     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1425       return hasBufferRsrcWorkaround(Query.Types[0]);
1426     });
1427 
1428     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1429     // 64-bits.
1430     //
1431     // TODO: Should generalize bitcast action into coerce, which will also cover
1432     // inserting addrspacecasts.
1433     Actions.customIf(typeIs(1, Constant32Ptr));
1434 
1435     // Turn any illegal element vectors into something easier to deal
1436     // with. These will ultimately produce 32-bit scalar shifts to extract the
1437     // parts anyway.
1438     //
1439     // For odd 16-bit element vectors, prefer to split those into pieces with
1440     // 16-bit vector parts.
1441     Actions.bitcastIf(
1442       [=](const LegalityQuery &Query) -> bool {
1443         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1444                                           Query.MMODescrs[0].MemoryTy);
1445       }, bitcastToRegisterType(0));
1446 
1447     if (!IsStore) {
1448       // Widen suitably aligned loads by loading extra bytes. The standard
1449       // legalization actions can't properly express widening memory operands.
1450       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1451         return shouldWidenLoad(ST, Query, G_LOAD);
1452       });
1453     }
1454 
1455     // FIXME: load/store narrowing should be moved to lower action
1456     Actions
1457         .narrowScalarIf(
1458             [=](const LegalityQuery &Query) -> bool {
1459               return !Query.Types[0].isVector() &&
1460                      needToSplitMemOp(Query, Op == G_LOAD);
1461             },
1462             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1463               const LLT DstTy = Query.Types[0];
1464               const LLT PtrTy = Query.Types[1];
1465 
1466               const unsigned DstSize = DstTy.getSizeInBits();
1467               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1468 
1469               // Split extloads.
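              // For illustration, an extending s64 load from 32-bit memory is
              // narrowed here to a 32-bit load, with the extension back to s64
              // recreated as a separate operation.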
1470               if (DstSize > MemSize)
1471                 return std::pair(0, LLT::scalar(MemSize));
1472 
1473               unsigned MaxSize = maxSizeForAddrSpace(
1474                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1475                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1476               if (MemSize > MaxSize)
1477                 return std::pair(0, LLT::scalar(MaxSize));
1478 
1479               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1480               return std::pair(0, LLT::scalar(Align));
1481             })
1482         .fewerElementsIf(
1483             [=](const LegalityQuery &Query) -> bool {
1484               return Query.Types[0].isVector() &&
1485                      needToSplitMemOp(Query, Op == G_LOAD);
1486             },
1487             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1488               const LLT DstTy = Query.Types[0];
1489               const LLT PtrTy = Query.Types[1];
1490 
1491               LLT EltTy = DstTy.getElementType();
1492               unsigned MaxSize = maxSizeForAddrSpace(
1493                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1494                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1495 
1496               // FIXME: Handle widened to power of 2 results better. This ends
1497               // up scalarizing.
1498               // FIXME: 3 element stores scalarized on SI
1499 
1500               // Split if it's too large for the address space.
1501               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1502               if (MemSize > MaxSize) {
1503                 unsigned NumElts = DstTy.getNumElements();
1504                 unsigned EltSize = EltTy.getSizeInBits();
1505 
1506                 if (MaxSize % EltSize == 0) {
1507                   return std::pair(
1508                       0, LLT::scalarOrVector(
1509                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1510                 }
1511 
1512                 unsigned NumPieces = MemSize / MaxSize;
1513 
1514                 // FIXME: Refine when odd breakdowns handled
1515                 // The scalars will need to be re-legalized.
1516                 if (NumPieces == 1 || NumPieces >= NumElts ||
1517                     NumElts % NumPieces != 0)
1518                   return std::pair(0, EltTy);
1519 
1520                 return std::pair(0,
1521                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1522               }
1523 
1524               // FIXME: We could probably handle weird extending loads better.
1525               if (DstTy.getSizeInBits() > MemSize)
1526                 return std::pair(0, EltTy);
1527 
1528               unsigned EltSize = EltTy.getSizeInBits();
1529               unsigned DstSize = DstTy.getSizeInBits();
1530               if (!isPowerOf2_32(DstSize)) {
1531                 // We're probably decomposing an odd sized store. Try to split
1532                 // to the widest type. TODO: Account for alignment. As-is it
1533                 // should be OK, since the new parts will be further legalized.
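                // For example, a 96-bit <3 x s32> store that needs splitting is
                // broken here into a <2 x s32> part, leaving an s32 remainder for
                // re-legalization.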
1534                 unsigned FloorSize = llvm::bit_floor(DstSize);
1535                 return std::pair(
1536                     0, LLT::scalarOrVector(
1537                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1538               }
1539 
1540               // May need relegalization for the scalars.
1541               return std::pair(0, EltTy);
1542             })
1543     .minScalar(0, S32)
1544     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1545     .widenScalarToNextPow2(0)
1546     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1547     .lower();
1548   }
1549 
1550   // FIXME: Unaligned accesses not lowered.
1551   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1552                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1553                                                   {S32, GlobalPtr, S16, 2 * 8},
1554                                                   {S32, LocalPtr, S8, 8},
1555                                                   {S32, LocalPtr, S16, 16},
1556                                                   {S32, PrivatePtr, S8, 8},
1557                                                   {S32, PrivatePtr, S16, 16},
1558                                                   {S32, ConstantPtr, S8, 8},
1559                                                   {S32, ConstantPtr, S16, 2 * 8}})
1560                        .legalIf(
1561                          [=](const LegalityQuery &Query) -> bool {
1562                            return isLoadStoreLegal(ST, Query);
1563                          });
1564 
1565   if (ST.hasFlatAddressSpace()) {
1566     ExtLoads.legalForTypesWithMemDesc(
1567         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1568   }
1569 
1570   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1571   // 64-bits.
1572   //
1573   // TODO: Should generalize bitcast action into coerce, which will also cover
1574   // inserting addrspacecasts.
1575   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1576 
1577   ExtLoads.clampScalar(0, S32, S32)
1578           .widenScalarToNextPow2(0)
1579           .lower();
1580 
1581   auto &Atomics = getActionDefinitionsBuilder(
1582     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1583      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1584      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1585      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1586     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1587                {S64, GlobalPtr}, {S64, LocalPtr},
1588                {S32, RegionPtr}, {S64, RegionPtr}});
1589   if (ST.hasFlatAddressSpace()) {
1590     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1591   }
1592 
1593   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1594   if (ST.hasLDSFPAtomicAdd()) {
1595     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1596     if (ST.hasGFX90AInsts())
1597       Atomic.legalFor({{S64, LocalPtr}});
1598     if (ST.hasAtomicDsPkAdd16Insts())
1599       Atomic.legalFor({{V2S16, LocalPtr}});
1600   }
1601   if (ST.hasAtomicFaddInsts())
1602     Atomic.legalFor({{S32, GlobalPtr}});
1603   if (ST.hasFlatAtomicFaddF32Inst())
1604     Atomic.legalFor({{S32, FlatPtr}});
1605 
1606   if (ST.hasGFX90AInsts()) {
1607     // These are legal with some caveats, and should have undergone expansion in
1608     // the IR in most situations.
1609     // TODO: Move atomic expansion into legalizer
1610     Atomic.legalFor({
1611         {S32, GlobalPtr},
1612         {S64, GlobalPtr},
1613         {S64, FlatPtr}
1614       });
1615   }
1616 
1617   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1618   // demarshalling.
1619   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1620     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1621                 {S32, FlatPtr}, {S64, FlatPtr}})
1622     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1623                {S32, RegionPtr}, {S64, RegionPtr}});
1624   // TODO: Pointer types, any 32-bit or 64-bit vector
1625 
1626   // Condition should be s32 for scalar, s1 for vector.
1627   getActionDefinitionsBuilder(G_SELECT)
1628       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1629                                  LocalPtr, FlatPtr, PrivatePtr,
1630                                  LLT::fixed_vector(2, LocalPtr),
1631                                  LLT::fixed_vector(2, PrivatePtr)},
1632                                 {S1, S32})
1633       .clampScalar(0, S16, S64)
1634       .scalarize(1)
1635       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1636       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1637       .clampMaxNumElements(0, S32, 2)
1638       .clampMaxNumElements(0, LocalPtr, 2)
1639       .clampMaxNumElements(0, PrivatePtr, 2)
1640       .scalarize(0)
1641       .widenScalarToNextPow2(0)
1642       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1643 
1644   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1645   // be more flexible with the shift amount type.
1646   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1647     .legalFor({{S32, S32}, {S64, S32}});
1648   if (ST.has16BitInsts()) {
1649     if (ST.hasVOP3PInsts()) {
1650       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1651             .clampMaxNumElements(0, S16, 2);
1652     } else
1653       Shifts.legalFor({{S16, S16}});
1654 
1655     // TODO: Support 16-bit shift amounts for all types
1656     Shifts.widenScalarIf(
1657       [=](const LegalityQuery &Query) {
1658         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1659         // 32-bit amount.
1660         const LLT ValTy = Query.Types[0];
1661         const LLT AmountTy = Query.Types[1];
1662         return ValTy.getSizeInBits() <= 16 &&
1663                AmountTy.getSizeInBits() < 16;
1664       }, changeTo(1, S16));
1665     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1666     Shifts.clampScalar(1, S32, S32);
1667     Shifts.widenScalarToNextPow2(0, 16);
1668     Shifts.clampScalar(0, S16, S64);
1669 
1670     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1671       .minScalar(0, S16)
1672       .scalarize(0)
1673       .lower();
1674   } else {
1675     // Make sure we legalize the shift amount type first, as the general
1676     // expansion for the shifted type will produce much worse code if it hasn't
1677     // been truncated already.
1678     Shifts.clampScalar(1, S32, S32);
1679     Shifts.widenScalarToNextPow2(0, 32);
1680     Shifts.clampScalar(0, S32, S64);
1681 
1682     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1683       .minScalar(0, S32)
1684       .scalarize(0)
1685       .lower();
1686   }
1687   Shifts.scalarize(0);
1688 
1689   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1690     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1691     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1692     unsigned IdxTypeIdx = 2;
1693 
1694     getActionDefinitionsBuilder(Op)
1695       .customIf([=](const LegalityQuery &Query) {
1696           const LLT EltTy = Query.Types[EltTypeIdx];
1697           const LLT VecTy = Query.Types[VecTypeIdx];
1698           const LLT IdxTy = Query.Types[IdxTypeIdx];
1699           const unsigned EltSize = EltTy.getSizeInBits();
1700           const bool isLegalVecType =
1701               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1702           // Address space 8 pointers are 128-bit wide values, but the logic
1703           // below will try to bitcast them to 2N x s64, which will fail.
1704           // Therefore, as an intermediate step, ptrtoint the vector and scalar
1705           // arguments of such extracts/insertions (and inttoptr the extraction
1706           // result) in order to produce a vector operation that can be handled
1707           // by the logic below.
1708           if (EltTy.isPointer() && EltSize > 64)
1709             return true;
1710           return (EltSize == 32 || EltSize == 64) &&
1711                   VecTy.getSizeInBits() % 32 == 0 &&
1712                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1713                   IdxTy.getSizeInBits() == 32 &&
1714                   isLegalVecType;
1715         })
1716       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1717                  bitcastToVectorElement32(VecTypeIdx))
1718       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1719       .bitcastIf(
1720         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1721         [=](const LegalityQuery &Query) {
1722           // For > 64-bit element types, try to turn this into a 64-bit
1723           // element vector since we may be able to do better indexing
1724           // if this is scalar. If not, fall back to 32.
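          // For example, an extract of an s128 element from <2 x s128> is
          // rewritten here to operate on a <4 x s64> vector instead.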
1725           const LLT EltTy = Query.Types[EltTypeIdx];
1726           const LLT VecTy = Query.Types[VecTypeIdx];
1727           const unsigned DstEltSize = EltTy.getSizeInBits();
1728           const unsigned VecSize = VecTy.getSizeInBits();
1729 
1730           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1731           return std::pair(
1732               VecTypeIdx,
1733               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1734         })
1735       .clampScalar(EltTypeIdx, S32, S64)
1736       .clampScalar(VecTypeIdx, S32, S64)
1737       .clampScalar(IdxTypeIdx, S32, S32)
1738       .clampMaxNumElements(VecTypeIdx, S32, 32)
1739       // TODO: Clamp elements for 64-bit vectors?
1740       .moreElementsIf(
1741         isIllegalRegisterType(VecTypeIdx),
1742         moreElementsToNextExistingRegClass(VecTypeIdx))
1743       // It should only be necessary with variable indexes.
1744       // As a last resort, lower to the stack
1745       .lower();
1746   }
1747 
1748   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1749     .unsupportedIf([=](const LegalityQuery &Query) {
1750         const LLT &EltTy = Query.Types[1].getElementType();
1751         return Query.Types[0] != EltTy;
1752       });
1753 
1754   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1755     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1756     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1757 
1758     // FIXME: Doesn't handle extract of illegal sizes.
1759     getActionDefinitionsBuilder(Op)
1760       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1761       .lowerIf([=](const LegalityQuery &Query) {
1762           // Sub-vector (or single element) insert and extract.
1763           // TODO: verify immediate offset here since lower only works with
1764           // whole elements.
1765           const LLT BigTy = Query.Types[BigTyIdx];
1766           return BigTy.isVector();
1767         })
1768       // FIXME: Multiples of 16 should not be legal.
1769       .legalIf([=](const LegalityQuery &Query) {
1770           const LLT BigTy = Query.Types[BigTyIdx];
1771           const LLT LitTy = Query.Types[LitTyIdx];
1772           return (BigTy.getSizeInBits() % 32 == 0) &&
1773                  (LitTy.getSizeInBits() % 16 == 0);
1774         })
1775       .widenScalarIf(
1776         [=](const LegalityQuery &Query) {
1777           const LLT BigTy = Query.Types[BigTyIdx];
1778           return (BigTy.getScalarSizeInBits() < 16);
1779         },
1780         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1781       .widenScalarIf(
1782         [=](const LegalityQuery &Query) {
1783           const LLT LitTy = Query.Types[LitTyIdx];
1784           return (LitTy.getScalarSizeInBits() < 16);
1785         },
1786         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1787       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1788       .widenScalarToNextPow2(BigTyIdx, 32);
1789 
1790   }
1791 
1792   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1793     .legalForCartesianProduct(AllS32Vectors, {S32})
1794     .legalForCartesianProduct(AllS64Vectors, {S64})
1795     .clampNumElements(0, V16S32, V32S32)
1796     .clampNumElements(0, V2S64, V16S64)
1797     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1798     .moreElementsIf(
1799       isIllegalRegisterType(0),
1800       moreElementsToNextExistingRegClass(0));
1801 
1802   if (ST.hasScalarPackInsts()) {
1803     BuildVector
1804       // FIXME: Should probably widen s1 vectors straight to s32
1805       .minScalarOrElt(0, S16)
1806       .minScalar(1, S16);
1807 
1808     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1809       .legalFor({V2S16, S32})
1810       .lower();
1811   } else {
1812     BuildVector.customFor({V2S16, S16});
1813     BuildVector.minScalarOrElt(0, S32);
1814 
1815     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1816       .customFor({V2S16, S32})
1817       .lower();
1818   }
1819 
1820   BuildVector.legalIf(isRegisterType(0));
1821 
1822   // FIXME: Clamp maximum size
1823   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1824     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1825     .clampMaxNumElements(0, S32, 32)
1826     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1827     .clampMaxNumElements(0, S16, 64);
1828 
1829   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1830 
1831   // Merge/Unmerge
1832   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1833     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1834     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1835 
1836     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1837       const LLT Ty = Query.Types[TypeIdx];
1838       if (Ty.isVector()) {
1839         const LLT &EltTy = Ty.getElementType();
1840         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1841           return true;
1842         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1843           return true;
1844       }
1845       return false;
1846     };
1847 
1848     auto &Builder = getActionDefinitionsBuilder(Op)
1849       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1850       .lowerFor({{S16, V2S16}})
1851       .lowerIf([=](const LegalityQuery &Query) {
1852           const LLT BigTy = Query.Types[BigTyIdx];
1853           return BigTy.getSizeInBits() == 32;
1854         })
1855       // Try to widen to s16 first for small types.
1856       // TODO: Only do this on targets with legal s16 shifts
1857       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1858       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1859       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1860       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1861                            elementTypeIs(1, S16)),
1862                        changeTo(1, V2S16))
1863       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1864       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1865       // valid.
1866       .clampScalar(LitTyIdx, S32, S512)
1867       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1868       // Break up vectors with weird elements into scalars
1869       .fewerElementsIf(
1870         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1871         scalarize(0))
1872       .fewerElementsIf(
1873         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1874         scalarize(1))
1875       .clampScalar(BigTyIdx, S32, MaxScalar);
1876 
1877     if (Op == G_MERGE_VALUES) {
1878       Builder.widenScalarIf(
1879         // TODO: Use 16-bit shifts if legal for 8-bit values?
1880         [=](const LegalityQuery &Query) {
1881           const LLT Ty = Query.Types[LitTyIdx];
1882           return Ty.getSizeInBits() < 32;
1883         },
1884         changeTo(LitTyIdx, S32));
1885     }
1886 
1887     Builder.widenScalarIf(
1888       [=](const LegalityQuery &Query) {
1889         const LLT Ty = Query.Types[BigTyIdx];
1890         return Ty.getSizeInBits() % 16 != 0;
1891       },
1892       [=](const LegalityQuery &Query) {
1893         // Pick the next power of 2, or a multiple of 64 over 128,
1894         // whichever is smaller.
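        // For example, s65 widens to s128 (the next power of 2), while s260
        // widens to s320 (the next multiple of 64) rather than s512.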
1895         const LLT &Ty = Query.Types[BigTyIdx];
1896         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1897         if (NewSizeInBits >= 256) {
1898           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1899           if (RoundedTo < NewSizeInBits)
1900             NewSizeInBits = RoundedTo;
1901         }
1902         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1903       })
1904       // Any vectors left are the wrong size. Scalarize them.
1905       .scalarize(0)
1906       .scalarize(1);
1907   }
1908 
1909   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1910   // RegBankSelect.
1911   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1912     .legalFor({{S32}, {S64}});
1913 
1914   if (ST.hasVOP3PInsts()) {
1915     SextInReg.lowerFor({{V2S16}})
1916       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1917       // get more vector shift opportunities, since we'll get those when
1918       // expanded.
1919       .clampMaxNumElementsStrict(0, S16, 2);
1920   } else if (ST.has16BitInsts()) {
1921     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1922   } else {
1923     // Prefer to promote to s32 before lowering if we don't have 16-bit
1924     // shifts. This avoids a lot of intermediate truncate and extend operations.
1925     SextInReg.lowerFor({{S32}, {S64}});
1926   }
1927 
1928   SextInReg
1929     .scalarize(0)
1930     .clampScalar(0, S32, S64)
1931     .lower();
1932 
1933   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1934     .scalarize(0)
1935     .lower();
1936 
1937   // TODO: Only try to form v2s16 with legal packed instructions.
1938   getActionDefinitionsBuilder(G_FSHR)
1939     .legalFor({{S32, S32}})
1940     .lowerFor({{V2S16, V2S16}})
1941     .clampMaxNumElementsStrict(0, S16, 2)
1942     .scalarize(0)
1943     .lower();
1944 
1945   if (ST.hasVOP3PInsts()) {
1946     getActionDefinitionsBuilder(G_FSHL)
1947       .lowerFor({{V2S16, V2S16}})
1948       .clampMaxNumElementsStrict(0, S16, 2)
1949       .scalarize(0)
1950       .lower();
1951   } else {
1952     getActionDefinitionsBuilder(G_FSHL)
1953       .scalarize(0)
1954       .lower();
1955   }
1956 
1957   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1958     .legalFor({S64});
1959 
1960   getActionDefinitionsBuilder(G_FENCE)
1961     .alwaysLegal();
1962 
1963   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1964       .scalarize(0)
1965       .minScalar(0, S32)
1966       .lower();
1967 
1968   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1969       .legalFor({{S32, S32}, {S64, S32}})
1970       .clampScalar(1, S32, S32)
1971       .clampScalar(0, S32, S64)
1972       .widenScalarToNextPow2(0)
1973       .scalarize(0);
1974 
1975   getActionDefinitionsBuilder(
1976       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
1977        G_FCOPYSIGN,
1978 
1979        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
1980        G_READ_REGISTER, G_WRITE_REGISTER,
1981 
1982        G_SADDO, G_SSUBO})
1983       .lower();
1984 
1985   if (ST.hasIEEEMinMax()) {
1986     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
1987         .legalFor(FPTypesPK16)
1988         .clampMaxNumElements(0, S16, 2)
1989         .scalarize(0);
1990   } else {
1991     // TODO: Implement
1992     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
1993   }
1994 
1995   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1996       .lower();
1997 
1998   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1999         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2000         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2001     .unsupported();
2002 
2003   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2004 
2005   getLegacyLegalizerInfo().computeTables();
2006   verify(*ST.getInstrInfo());
2007 }
2008 
2009 bool AMDGPULegalizerInfo::legalizeCustom(
2010     LegalizerHelper &Helper, MachineInstr &MI,
2011     LostDebugLocObserver &LocObserver) const {
2012   MachineIRBuilder &B = Helper.MIRBuilder;
2013   MachineRegisterInfo &MRI = *B.getMRI();
2014 
2015   switch (MI.getOpcode()) {
2016   case TargetOpcode::G_ADDRSPACE_CAST:
2017     return legalizeAddrSpaceCast(MI, MRI, B);
2018   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2019     return legalizeFroundeven(MI, MRI, B);
2020   case TargetOpcode::G_FCEIL:
2021     return legalizeFceil(MI, MRI, B);
2022   case TargetOpcode::G_FREM:
2023     return legalizeFrem(MI, MRI, B);
2024   case TargetOpcode::G_INTRINSIC_TRUNC:
2025     return legalizeIntrinsicTrunc(MI, MRI, B);
2026   case TargetOpcode::G_SITOFP:
2027     return legalizeITOFP(MI, MRI, B, true);
2028   case TargetOpcode::G_UITOFP:
2029     return legalizeITOFP(MI, MRI, B, false);
2030   case TargetOpcode::G_FPTOSI:
2031     return legalizeFPTOI(MI, MRI, B, true);
2032   case TargetOpcode::G_FPTOUI:
2033     return legalizeFPTOI(MI, MRI, B, false);
2034   case TargetOpcode::G_FMINNUM:
2035   case TargetOpcode::G_FMAXNUM:
2036   case TargetOpcode::G_FMINNUM_IEEE:
2037   case TargetOpcode::G_FMAXNUM_IEEE:
2038     return legalizeMinNumMaxNum(Helper, MI);
2039   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2040     return legalizeExtractVectorElt(MI, MRI, B);
2041   case TargetOpcode::G_INSERT_VECTOR_ELT:
2042     return legalizeInsertVectorElt(MI, MRI, B);
2043   case TargetOpcode::G_FSIN:
2044   case TargetOpcode::G_FCOS:
2045     return legalizeSinCos(MI, MRI, B);
2046   case TargetOpcode::G_GLOBAL_VALUE:
2047     return legalizeGlobalValue(MI, MRI, B);
2048   case TargetOpcode::G_LOAD:
2049   case TargetOpcode::G_SEXTLOAD:
2050   case TargetOpcode::G_ZEXTLOAD:
2051     return legalizeLoad(Helper, MI);
2052   case TargetOpcode::G_STORE:
2053     return legalizeStore(Helper, MI);
2054   case TargetOpcode::G_FMAD:
2055     return legalizeFMad(MI, MRI, B);
2056   case TargetOpcode::G_FDIV:
2057     return legalizeFDIV(MI, MRI, B);
2058   case TargetOpcode::G_FFREXP:
2059     return legalizeFFREXP(MI, MRI, B);
2060   case TargetOpcode::G_FSQRT:
2061     return legalizeFSQRT(MI, MRI, B);
2062   case TargetOpcode::G_UDIV:
2063   case TargetOpcode::G_UREM:
2064   case TargetOpcode::G_UDIVREM:
2065     return legalizeUnsignedDIV_REM(MI, MRI, B);
2066   case TargetOpcode::G_SDIV:
2067   case TargetOpcode::G_SREM:
2068   case TargetOpcode::G_SDIVREM:
2069     return legalizeSignedDIV_REM(MI, MRI, B);
2070   case TargetOpcode::G_ATOMIC_CMPXCHG:
2071     return legalizeAtomicCmpXChg(MI, MRI, B);
2072   case TargetOpcode::G_FLOG2:
2073     return legalizeFlog2(MI, B);
2074   case TargetOpcode::G_FLOG:
2075   case TargetOpcode::G_FLOG10:
2076     return legalizeFlogCommon(MI, B);
2077   case TargetOpcode::G_FEXP2:
2078     return legalizeFExp2(MI, B);
2079   case TargetOpcode::G_FEXP:
2080   case TargetOpcode::G_FEXP10:
2081     return legalizeFExp(MI, B);
2082   case TargetOpcode::G_FPOW:
2083     return legalizeFPow(MI, B);
2084   case TargetOpcode::G_FFLOOR:
2085     return legalizeFFloor(MI, MRI, B);
2086   case TargetOpcode::G_BUILD_VECTOR:
2087   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2088     return legalizeBuildVector(MI, MRI, B);
2089   case TargetOpcode::G_MUL:
2090     return legalizeMul(Helper, MI);
2091   case TargetOpcode::G_CTLZ:
2092   case TargetOpcode::G_CTTZ:
2093     return legalizeCTLZ_CTTZ(MI, MRI, B);
2094   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2095     return legalizeFPTruncRound(MI, B);
2096   case TargetOpcode::G_STACKSAVE:
2097     return legalizeStackSave(MI, B);
2098   default:
2099     return false;
2100   }
2101 
2102   llvm_unreachable("expected switch to return");
2103 }
2104 
2105 Register AMDGPULegalizerInfo::getSegmentAperture(
2106   unsigned AS,
2107   MachineRegisterInfo &MRI,
2108   MachineIRBuilder &B) const {
2109   MachineFunction &MF = B.getMF();
2110   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2111   const LLT S32 = LLT::scalar(32);
2112   const LLT S64 = LLT::scalar(64);
2113 
2114   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2115 
2116   if (ST.hasApertureRegs()) {
2117     // Note: this register is somewhat broken. When used as a 32-bit operand,
2118     // it only returns zeroes. The real value is in the upper 32 bits.
2119     // Thus, we must emit an extract of the high 32 bits.
2120     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2121                                        ? AMDGPU::SRC_SHARED_BASE
2122                                        : AMDGPU::SRC_PRIVATE_BASE;
2123     // FIXME: It would be more natural to emit a COPY here, but then copy
2124     // coalescing would kick in and it would think it's okay to use the "HI"
2125     // subregister (instead of extracting the HI 32 bits) which is an artificial
2126     // (unusable) register.
2127     //  Register TableGen definitions would need an overhaul to get rid of the
2128     //  artificial "HI" aperture registers and prevent this kind of issue from
2129     //  happening.
2130     Register Dst = MRI.createGenericVirtualRegister(S64);
2131     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2132     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2133     return B.buildUnmerge(S32, Dst).getReg(1);
2134   }
2135 
2136   // TODO: can we be smarter about machine pointer info?
2137   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2138   Register LoadAddr = MRI.createGenericVirtualRegister(
2139     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2140   // For code object version 5, private_base and shared_base are passed through
2141   // implicit kernargs.
2142   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2143       AMDGPU::AMDHSA_COV5) {
2144     AMDGPUTargetLowering::ImplicitParameter Param =
2145         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2146                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2147     uint64_t Offset =
2148         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2149 
2150     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2151         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2152 
2153     if (!loadInputValue(KernargPtrReg, B,
2154                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2155       return Register();
2156 
2157     MachineMemOperand *MMO = MF.getMachineMemOperand(
2158         PtrInfo,
2159         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2160             MachineMemOperand::MOInvariant,
2161         LLT::scalar(32), commonAlignment(Align(64), Offset));
2162 
2163     // Pointer address
2164     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2165                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2166     // Load address
2167     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2168   }
2169 
2170   Register QueuePtr = MRI.createGenericVirtualRegister(
2171     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2172 
2173   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2174     return Register();
2175 
2176   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2177   // private_segment_aperture_base_hi.
2178   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
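  // The loaded 32-bit value is the high half of the 64-bit aperture base; the
  // caller (e.g. legalizeAddrSpaceCast) merges it with a 32-bit segment offset
  // to form the full flat address.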
2179 
2180   MachineMemOperand *MMO = MF.getMachineMemOperand(
2181       PtrInfo,
2182       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2183           MachineMemOperand::MOInvariant,
2184       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2185 
2186   B.buildPtrAdd(LoadAddr, QueuePtr,
2187                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2188   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2189 }
2190 
2191 /// Return true if the value is a known valid address, such that a null check is
2192 /// not necessary.
2193 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2194                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2195   MachineInstr *Def = MRI.getVRegDef(Val);
2196   switch (Def->getOpcode()) {
2197   case AMDGPU::G_FRAME_INDEX:
2198   case AMDGPU::G_GLOBAL_VALUE:
2199   case AMDGPU::G_BLOCK_ADDR:
2200     return true;
2201   case AMDGPU::G_CONSTANT: {
2202     const ConstantInt *CI = Def->getOperand(1).getCImm();
2203     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2204   }
2205   default:
2206     return false;
2207   }
2208 
2209   return false;
2210 }
2211 
2212 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2213   MachineInstr &MI, MachineRegisterInfo &MRI,
2214   MachineIRBuilder &B) const {
2215   MachineFunction &MF = B.getMF();
2216 
2217   const LLT S32 = LLT::scalar(32);
2218   Register Dst = MI.getOperand(0).getReg();
2219   Register Src = MI.getOperand(1).getReg();
2220 
2221   LLT DstTy = MRI.getType(Dst);
2222   LLT SrcTy = MRI.getType(Src);
2223   unsigned DestAS = DstTy.getAddressSpace();
2224   unsigned SrcAS = SrcTy.getAddressSpace();
2225 
2226   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2227   // vector element.
2228   assert(!DstTy.isVector());
2229 
2230   const AMDGPUTargetMachine &TM
2231     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2232 
2233   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2234     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2235     return true;
2236   }
2237 
2238   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2239       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2240        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2241     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2242       // Extract low 32-bits of the pointer.
2243       B.buildExtract(Dst, Src, 0);
2244       MI.eraseFromParent();
2245       return true;
2246     }
2247 
2248     unsigned NullVal = TM.getNullPointerValue(DestAS);
2249 
2250     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2251     auto FlatNull = B.buildConstant(SrcTy, 0);
2252 
2253     // Extract low 32-bits of the pointer.
2254     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2255 
2256     auto CmpRes =
2257         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2258     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2259 
2260     MI.eraseFromParent();
2261     return true;
2262   }
2263 
2264   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2265       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2266        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2267     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2268     if (!ApertureReg.isValid())
2269       return false;
2270 
2271     // Coerce the type of the low half of the result so we can use merge_values.
2272     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2273 
2274     // TODO: Should we allow mismatched types but matching sizes in merges to
2275     // avoid the ptrtoint?
2276     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2277 
2278     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2279       B.buildCopy(Dst, BuildPtr);
2280       MI.eraseFromParent();
2281       return true;
2282     }
2283 
2284     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2285     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2286 
2287     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2288                               SegmentNull.getReg(0));
2289 
2290     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2291 
2292     MI.eraseFromParent();
2293     return true;
2294   }
2295 
2296   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2297       SrcTy.getSizeInBits() == 64) {
2298     // Truncate.
2299     B.buildExtract(Dst, Src, 0);
2300     MI.eraseFromParent();
2301     return true;
2302   }
2303 
2304   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2305       DstTy.getSizeInBits() == 64) {
2306     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2307     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2308     auto PtrLo = B.buildPtrToInt(S32, Src);
2309     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2310     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2311     MI.eraseFromParent();
2312     return true;
2313   }
2314 
2315   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2316       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2317 
2318   LLVMContext &Ctx = MF.getFunction().getContext();
2319   Ctx.diagnose(InvalidAddrSpaceCast);
2320   B.buildUndef(Dst);
2321   MI.eraseFromParent();
2322   return true;
2323 }
2324 
2325 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2326                                              MachineRegisterInfo &MRI,
2327                                              MachineIRBuilder &B) const {
2328   Register Src = MI.getOperand(1).getReg();
2329   LLT Ty = MRI.getType(Src);
2330   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2331 
2332   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2333   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
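  // C1 (2^52) implements the usual add/subtract-2^52 trick: every f64 with a
  // magnitude of at least 2^52 is already an integer, so adding and then
  // subtracting copysign(2^52, src) rounds src to the nearest integer under the
  // default round-to-nearest-even mode. C2 is the largest f64 below 2^52; the
  // select below returns src unchanged when its magnitude is already too large
  // for the trick to be needed.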
2334 
2335   auto C1 = B.buildFConstant(Ty, C1Val);
2336   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2337 
2338   // TODO: Should this propagate fast-math-flags?
2339   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2340   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2341 
2342   auto C2 = B.buildFConstant(Ty, C2Val);
2343   auto Fabs = B.buildFAbs(Ty, Src);
2344 
2345   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2346   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2347   MI.eraseFromParent();
2348   return true;
2349 }
2350 
2351 bool AMDGPULegalizerInfo::legalizeFceil(
2352   MachineInstr &MI, MachineRegisterInfo &MRI,
2353   MachineIRBuilder &B) const {
2354 
2355   const LLT S1 = LLT::scalar(1);
2356   const LLT S64 = LLT::scalar(64);
2357 
2358   Register Src = MI.getOperand(1).getReg();
2359   assert(MRI.getType(Src) == S64);
2360 
2361   // result = trunc(src)
2362   // if (src > 0.0 && src != result)
2363   //   result += 1.0
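  //
  // e.g. src = 1.5 gives trunc = 1.0, and since 1.5 > 0.0 and 1.5 != 1.0 the
  // result is 2.0; src = -1.5 gives trunc = -1.0, which is already the ceiling.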
2364 
2365   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2366 
2367   const auto Zero = B.buildFConstant(S64, 0.0);
2368   const auto One = B.buildFConstant(S64, 1.0);
2369   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2370   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2371   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2372   auto Add = B.buildSelect(S64, And, One, Zero);
2373 
2374   // TODO: Should this propagate fast-math-flags?
2375   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2376   MI.eraseFromParent();
2377   return true;
2378 }
2379 
2380 bool AMDGPULegalizerInfo::legalizeFrem(
2381   MachineInstr &MI, MachineRegisterInfo &MRI,
2382   MachineIRBuilder &B) const {
2383     Register DstReg = MI.getOperand(0).getReg();
2384     Register Src0Reg = MI.getOperand(1).getReg();
2385     Register Src1Reg = MI.getOperand(2).getReg();
2386     auto Flags = MI.getFlags();
2387     LLT Ty = MRI.getType(DstReg);
2388 
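    // frem(x, y) is lowered as x - trunc(x / y) * y, built below as
    // fma(-trunc(x / y), y, x).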
2389     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2390     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2391     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2392     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2393     MI.eraseFromParent();
2394     return true;
2395 }
2396 
2397 static MachineInstrBuilder extractF64Exponent(Register Hi,
2398                                               MachineIRBuilder &B) {
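  // The f64 exponent field is bits [52, 62] of the full value, i.e. bits
  // [20, 30] of the high half; extract those 11 bits with ubfe and subtract the
  // exponent bias (1023).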
2399   const unsigned FractBits = 52;
2400   const unsigned ExpBits = 11;
2401   LLT S32 = LLT::scalar(32);
2402 
2403   auto Const0 = B.buildConstant(S32, FractBits - 32);
2404   auto Const1 = B.buildConstant(S32, ExpBits);
2405 
2406   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2407                      .addUse(Hi)
2408                      .addUse(Const0.getReg(0))
2409                      .addUse(Const1.getReg(0));
2410 
2411   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2412 }
2413 
2414 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2415   MachineInstr &MI, MachineRegisterInfo &MRI,
2416   MachineIRBuilder &B) const {
2417   const LLT S1 = LLT::scalar(1);
2418   const LLT S32 = LLT::scalar(32);
2419   const LLT S64 = LLT::scalar(64);
2420 
2421   Register Src = MI.getOperand(1).getReg();
2422   assert(MRI.getType(Src) == S64);
2423 
2424   // TODO: Should this use extract since the low half is unused?
2425   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2426   Register Hi = Unmerge.getReg(1);
2427 
2428   // Extract the upper half, since this is where we will find the sign and
2429   // exponent.
2430   auto Exp = extractF64Exponent(Hi, B);
2431 
2432   const unsigned FractBits = 52;
2433 
2434   // Extract the sign bit.
2435   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2436   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2437 
2438   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2439 
2440   const auto Zero32 = B.buildConstant(S32, 0);
2441 
2442   // Extend back to 64-bits.
2443   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2444 
2445   auto Shr = B.buildAShr(S64, FractMask, Exp);
2446   auto Not = B.buildNot(S64, Shr);
2447   auto Tmp0 = B.buildAnd(S64, Src, Not);
2448   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2449 
2450   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2451   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2452 
2453   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2454   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2455   MI.eraseFromParent();
2456   return true;
2457 }
2458 
2459 bool AMDGPULegalizerInfo::legalizeITOFP(
2460   MachineInstr &MI, MachineRegisterInfo &MRI,
2461   MachineIRBuilder &B, bool Signed) const {
2462 
2463   Register Dst = MI.getOperand(0).getReg();
2464   Register Src = MI.getOperand(1).getReg();
2465 
2466   const LLT S64 = LLT::scalar(64);
2467   const LLT S32 = LLT::scalar(32);
2468 
2469   assert(MRI.getType(Src) == S64);
2470 
2471   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2472   auto ThirtyTwo = B.buildConstant(S32, 32);
2473 
2474   if (MRI.getType(Dst) == S64) {
2475     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2476                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2477 
2478     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2479     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2480 
2481     // TODO: Should this propagate fast-math-flags?
2482     B.buildFAdd(Dst, LdExp, CvtLo);
2483     MI.eraseFromParent();
2484     return true;
2485   }
2486 
2487   assert(MRI.getType(Dst) == S32);
2488 
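  // For conversion to f32, roughly: shift the 64-bit source left so its
  // significant bits occupy the high 32 bits, convert that word with a sticky
  // bit (umin(lo, 1)) ORed into bit 0 so rounding still sees the discarded low
  // bits, then rescale the result with ldexp by (32 - shift amount).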
2489   auto One = B.buildConstant(S32, 1);
2490 
2491   MachineInstrBuilder ShAmt;
2492   if (Signed) {
2493     auto ThirtyOne = B.buildConstant(S32, 31);
2494     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2495     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2496     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2497     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2498                   .addUse(Unmerge.getReg(1));
2499     auto LS2 = B.buildSub(S32, LS, One);
2500     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2501   } else
2502     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2503   auto Norm = B.buildShl(S64, Src, ShAmt);
2504   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2505   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2506   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2507   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2508   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2509   B.buildFLdexp(Dst, FVal, Scale);
2510   MI.eraseFromParent();
2511   return true;
2512 }
2513 
2514 // TODO: Copied from DAG implementation. Verify logic and document how this
2515 // actually works.
2516 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2517                                         MachineRegisterInfo &MRI,
2518                                         MachineIRBuilder &B,
2519                                         bool Signed) const {
2520 
2521   Register Dst = MI.getOperand(0).getReg();
2522   Register Src = MI.getOperand(1).getReg();
2523 
2524   const LLT S64 = LLT::scalar(64);
2525   const LLT S32 = LLT::scalar(32);
2526 
2527   const LLT SrcLT = MRI.getType(Src);
2528   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2529 
2530   unsigned Flags = MI.getFlags();
2531 
2532   // The basic idea of converting a floating point number into a pair of 32-bit
2533   // integers is illustrated as follows:
2534   //
2535   //     tf := trunc(val);
2536   //    hif := floor(tf * 2^-32);
2537   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2538   //     hi := fptoi(hif);
2539   //     lo := fptoi(lof);
2540   //
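  // For example, for a 64-bit val = 2^40 + 5: tf = 2^40 + 5,
  // hif = floor(tf * 2^-32) = 256, and lof = 5, so hi = 256 and lo = 5, and
  // (hi << 32) | lo reassembles the original integer.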
2541   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2542   MachineInstrBuilder Sign;
2543   if (Signed && SrcLT == S32) {
2544     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2545     // is not enough to hold all the significant bits of `lof` if val is
2546     // negative. To avoid the loss of precision, we need to take the absolute
2547     // value after truncating and flip the result back based on the original
2548     // signedness.
2549     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2550     Trunc = B.buildFAbs(S32, Trunc, Flags);
2551   }
2552   MachineInstrBuilder K0, K1;
2553   if (SrcLT == S64) {
2554     K0 = B.buildFConstant(
2555         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2556     K1 = B.buildFConstant(
2557         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2558   } else {
2559     K0 = B.buildFConstant(
2560         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2561     K1 = B.buildFConstant(
2562         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2563   }
2564 
2565   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2566   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2567   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2568 
2569   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2570                                      : B.buildFPTOUI(S32, FloorMul);
2571   auto Lo = B.buildFPTOUI(S32, Fma);
2572 
2573   if (Signed && SrcLT == S32) {
2574     // Flip the result based on the signedness, which is either all 0s or 1s.
2575     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2576     // r := xor({lo, hi}, sign) - sign;
2577     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2578                Sign);
2579   } else
2580     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2581   MI.eraseFromParent();
2582 
2583   return true;
2584 }
2585 
2586 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2587                                                MachineInstr &MI) const {
2588   MachineFunction &MF = Helper.MIRBuilder.getMF();
2589   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2590 
2591   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2592                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2593 
2594   // With ieee_mode disabled, the instructions have the correct behavior
2595   // already for G_FMINNUM/G_FMAXNUM.
2596   if (!MFI->getMode().IEEE)
2597     return !IsIEEEOp;
2598 
2599   if (IsIEEEOp)
2600     return true;
2601 
2602   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2603 }
2604 
2605 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2606   MachineInstr &MI, MachineRegisterInfo &MRI,
2607   MachineIRBuilder &B) const {
2608   // TODO: Should move some of this into LegalizerHelper.
2609 
2610   // TODO: Promote dynamic indexing of s16 to s32
2611 
2612   Register Dst = MI.getOperand(0).getReg();
2613   Register Vec = MI.getOperand(1).getReg();
2614 
2615   LLT VecTy = MRI.getType(Vec);
2616   LLT EltTy = VecTy.getElementType();
2617   assert(EltTy == MRI.getType(Dst));
2618 
2619   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2620   // but we can't go directly to that logic because you can't bitcast a vector
2621   // of pointers to a vector of integers. Therefore, introduce an intermediate
2622   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2623   // drive the legalization forward.
2624   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2625     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2626     LLT IntVecTy = VecTy.changeElementType(IntTy);
2627 
2628     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2629     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2630     B.buildIntToPtr(Dst, IntElt);
2631 
2632     MI.eraseFromParent();
2633     return true;
2634   }
2635 
2636   // FIXME: Artifact combiner probably should have replaced the truncated
2637   // constant before this, so we shouldn't need
2638   // getIConstantVRegValWithLookThrough.
2639   std::optional<ValueAndVReg> MaybeIdxVal =
2640       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2641   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2642     return true;
2643   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2644 
2645   if (IdxVal < VecTy.getNumElements()) {
2646     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2647     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2648   } else {
2649     B.buildUndef(Dst);
2650   }
2651 
2652   MI.eraseFromParent();
2653   return true;
2654 }
2655 
2656 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2657   MachineInstr &MI, MachineRegisterInfo &MRI,
2658   MachineIRBuilder &B) const {
2659   // TODO: Should move some of this into LegalizerHelper.
2660 
2661   // TODO: Promote dynamic indexing of s16 to s32
2662 
2663   Register Dst = MI.getOperand(0).getReg();
2664   Register Vec = MI.getOperand(1).getReg();
2665   Register Ins = MI.getOperand(2).getReg();
2666 
2667   LLT VecTy = MRI.getType(Vec);
2668   LLT EltTy = VecTy.getElementType();
2669   assert(EltTy == MRI.getType(Ins));
2670 
2671   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2672   // but we can't go directly to that logic because you can't bitcast a vector
2673   // of pointers to a vector of integers. Therefore, make the pointer vector
2674   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2675   // new value, and then inttoptr the result vector back. This will then allow
2676   // the rest of legalization to take over.
2677   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2678     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2679     LLT IntVecTy = VecTy.changeElementType(IntTy);
2680 
2681     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2682     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2683     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2684                                                  MI.getOperand(3));
2685     B.buildIntToPtr(Dst, IntVecDest);
2686     MI.eraseFromParent();
2687     return true;
2688   }
2689 
2690   // FIXME: Artifact combiner probably should have replaced the truncated
2691   // constant before this, so we shouldn't need
2692   // getIConstantVRegValWithLookThrough.
2693   std::optional<ValueAndVReg> MaybeIdxVal =
2694       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2695   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2696     return true;
2697 
2698   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699 
2700   unsigned NumElts = VecTy.getNumElements();
2701   if (IdxVal < NumElts) {
2702     SmallVector<Register, 8> SrcRegs;
2703     for (unsigned i = 0; i < NumElts; ++i)
2704       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2705     B.buildUnmerge(SrcRegs, Vec);
2706 
2707     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2708     B.buildMergeLikeInstr(Dst, SrcRegs);
2709   } else {
2710     B.buildUndef(Dst);
2711   }
2712 
2713   MI.eraseFromParent();
2714   return true;
2715 }
2716 
2717 bool AMDGPULegalizerInfo::legalizeSinCos(
2718   MachineInstr &MI, MachineRegisterInfo &MRI,
2719   MachineIRBuilder &B) const {
2720 
2721   Register DstReg = MI.getOperand(0).getReg();
2722   Register SrcReg = MI.getOperand(1).getReg();
2723   LLT Ty = MRI.getType(DstReg);
2724   unsigned Flags = MI.getFlags();
2725 
2726   Register TrigVal;
2727   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2728   if (ST.hasTrigReducedRange()) {
2729     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2730     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2731                   .addUse(MulVal.getReg(0))
2732                   .setMIFlags(Flags)
2733                   .getReg(0);
2734   } else
2735     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2736 
2737   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2738     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2739   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2740       .addUse(TrigVal)
2741       .setMIFlags(Flags);
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2747                                                   MachineIRBuilder &B,
2748                                                   const GlobalValue *GV,
2749                                                   int64_t Offset,
2750                                                   unsigned GAFlags) const {
2751   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2752   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2753   // to the following code sequence:
2754   //
2755   // For constant address space:
2756   //   s_getpc_b64 s[0:1]
2757   //   s_add_u32 s0, s0, $symbol
2758   //   s_addc_u32 s1, s1, 0
2759   //
2760   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2761   //   a fixup or relocation is emitted to replace $symbol with a literal
2762   //   constant, which is a pc-relative offset from the encoding of the $symbol
2763   //   operand to the global variable.
2764   //
2765   // For global address space:
2766   //   s_getpc_b64 s[0:1]
2767   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2768   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2769   //
2770   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2771   //   fixups or relocations are emitted to replace $symbol@*@lo and
2772   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2773   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2774   //   operand to the global variable.
2775 
2776   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2777 
2778   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2779     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2780 
2781   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2782     .addDef(PCReg);
2783 
2784   MIB.addGlobalAddress(GV, Offset, GAFlags);
2785   if (GAFlags == SIInstrInfo::MO_NONE)
2786     MIB.addImm(0);
2787   else
2788     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2789 
2790   if (!B.getMRI()->getRegClassOrNull(PCReg))
2791     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2792 
2793   if (PtrTy.getSizeInBits() == 32)
2794     B.buildExtract(DstReg, PCReg, 0);
2795   return true;
2796 }
2797 
2798 // Emit an ABS32_LO / ABS32_HI relocation stub.
2799 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2800     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2801     MachineRegisterInfo &MRI) const {
2802   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2803 
2804   LLT S32 = LLT::scalar(32);
2805 
2806   // Use the destination directly, if and only if we only store the lower
2807   // address part and no register class has been set on it.
2808   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2809                         ? DstReg
2810                         : MRI.createGenericVirtualRegister(S32);
2811 
2812   if (!MRI.getRegClassOrNull(AddrLo))
2813     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2814 
2815   // Write the lower half.
2816   B.buildInstr(AMDGPU::S_MOV_B32)
2817       .addDef(AddrLo)
2818       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2819 
2820   // If required, write the upper half as well.
2821   if (RequiresHighHalf) {
2822     assert(PtrTy.getSizeInBits() == 64 &&
2823            "Must provide a 64-bit pointer type!");
2824 
2825     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2826     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2827 
2828     B.buildInstr(AMDGPU::S_MOV_B32)
2829         .addDef(AddrHi)
2830         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2831 
2832     // Use the destination directly if and only if no register class has
2833     // been set on it.
2834     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2835                            ? DstReg
2836                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2837 
2838     if (!MRI.getRegClassOrNull(AddrDst))
2839       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2840 
2841     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2842 
2843     // If we created a new register for the destination, cast the result into
2844     // the final output.
2845     if (AddrDst != DstReg)
2846       B.buildCast(DstReg, AddrDst);
2847   } else if (AddrLo != DstReg) {
2848     // If we created a new register for the destination, cast the result into
2849     // the final output.
2850     B.buildCast(DstReg, AddrLo);
2851   }
2852 }
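// Roughly, the sequence built above is (for a 64-bit pointer; register names
// here are placeholders):
//   s_mov_b32 s_lo, sym@abs32@lo
//   s_mov_b32 s_hi, sym@abs32@hi
// followed by merging the two halves, plus a cast when a fresh 64-bit
// register had to be introduced for the destination.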
2853 
2854 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2855   MachineInstr &MI, MachineRegisterInfo &MRI,
2856   MachineIRBuilder &B) const {
2857   Register DstReg = MI.getOperand(0).getReg();
2858   LLT Ty = MRI.getType(DstReg);
2859   unsigned AS = Ty.getAddressSpace();
2860 
2861   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2862   MachineFunction &MF = B.getMF();
2863   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2864 
2865   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2866     if (!MFI->isModuleEntryFunction() &&
2867         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2868       const Function &Fn = MF.getFunction();
2869       DiagnosticInfoUnsupported BadLDSDecl(
2870         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2871         DS_Warning);
2872       Fn.getContext().diagnose(BadLDSDecl);
2873 
2874       // We currently don't have a way to correctly allocate LDS objects that
2875       // aren't directly associated with a kernel. We do force inlining of
2876       // functions that use local objects. However, if these dead functions are
2877       // not eliminated, we don't want a compile time error. Just emit a warning
2878       // and a trap, since there should be no callable path here.
2879       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
2880       B.buildUndef(DstReg);
2881       MI.eraseFromParent();
2882       return true;
2883     }
2884 
2885     // TODO: We could emit code to handle the initialization somewhere.
2886     // We ignore the initializer for now and legalize it to allow selection.
2887     // The initializer is diagnosed as an error during assembly emission anyway.
2888     const SITargetLowering *TLI = ST.getTargetLowering();
2889     if (!TLI->shouldUseLDSConstAddress(GV)) {
2890       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2891       return true; // Leave in place.
2892     }
2893 
2894     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2895       Type *Ty = GV->getValueType();
2896       // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2897       // zero-sized type in other languages) to declare dynamic shared
2898       // memory whose size is not known at compile time. Such variables are
2899       // allocated by the runtime and placed directly after the statically
2900       // allocated ones, so they all share the same offset.
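      // Concretely, the code below lowers the address of such a variable to
      //   inttoptr(llvm.amdgcn.groupstaticsize())
      // i.e. the dynamic portion starts right after the static LDS allocation.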
2901       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2902         // Adjust alignment for that dynamic shared memory array.
2903         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2904         LLT S32 = LLT::scalar(32);
2905         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2906         B.buildIntToPtr(DstReg, Sz);
2907         MI.eraseFromParent();
2908         return true;
2909       }
2910     }
2911 
2912     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2913                                                    *cast<GlobalVariable>(GV)));
2914     MI.eraseFromParent();
2915     return true;
2916   }
2917 
2918   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2919     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2920     MI.eraseFromParent();
2921     return true;
2922   }
2923 
2924   const SITargetLowering *TLI = ST.getTargetLowering();
2925 
2926   if (TLI->shouldEmitFixup(GV)) {
2927     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2928     MI.eraseFromParent();
2929     return true;
2930   }
2931 
2932   if (TLI->shouldEmitPCReloc(GV)) {
2933     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2934     MI.eraseFromParent();
2935     return true;
2936   }
2937 
2938   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2939   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2940 
2941   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2942   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2943       MachinePointerInfo::getGOT(MF),
2944       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2945           MachineMemOperand::MOInvariant,
2946       LoadTy, Align(8));
2947 
2948   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2949 
2950   if (Ty.getSizeInBits() == 32) {
2951     // Truncate if this is a 32-bit constant address.
2952     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2953     B.buildExtract(DstReg, Load, 0);
2954   } else
2955     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2956 
2957   MI.eraseFromParent();
2958   return true;
2959 }
2960 
2961 static LLT widenToNextPowerOf2(LLT Ty) {
2962   if (Ty.isVector())
2963     return Ty.changeElementCount(
2964         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2965   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2966 }
2967 
2968 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2969                                        MachineInstr &MI) const {
2970   MachineIRBuilder &B = Helper.MIRBuilder;
2971   MachineRegisterInfo &MRI = *B.getMRI();
2972   GISelChangeObserver &Observer = Helper.Observer;
2973 
2974   Register PtrReg = MI.getOperand(1).getReg();
2975   LLT PtrTy = MRI.getType(PtrReg);
2976   unsigned AddrSpace = PtrTy.getAddressSpace();
2977 
2978   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2979     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2980     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2981     Observer.changingInstr(MI);
2982     MI.getOperand(1).setReg(Cast.getReg(0));
2983     Observer.changedInstr(MI);
2984     return true;
2985   }
2986 
2987   if (MI.getOpcode() != AMDGPU::G_LOAD)
2988     return false;
2989 
2990   Register ValReg = MI.getOperand(0).getReg();
2991   LLT ValTy = MRI.getType(ValReg);
2992 
2993   if (hasBufferRsrcWorkaround(ValTy)) {
2994     Observer.changingInstr(MI);
2995     castBufferRsrcFromV4I32(MI, B, MRI, 0);
2996     Observer.changedInstr(MI);
2997     return true;
2998   }
2999 
3000   MachineMemOperand *MMO = *MI.memoperands_begin();
3001   const unsigned ValSize = ValTy.getSizeInBits();
3002   const LLT MemTy = MMO->getMemoryType();
3003   const Align MemAlign = MMO->getAlign();
3004   const unsigned MemSize = MemTy.getSizeInBits();
3005   const uint64_t AlignInBits = 8 * MemAlign.value();
3006 
3007   // Widen non-power-of-2 loads to the alignment if needed
3008   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3009     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3010 
3011     // This was already the correct extending load result type, so just adjust
3012     // the memory type.
3013     if (WideMemSize == ValSize) {
3014       MachineFunction &MF = B.getMF();
3015 
3016       MachineMemOperand *WideMMO =
3017           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3018       Observer.changingInstr(MI);
3019       MI.setMemRefs(MF, {WideMMO});
3020       Observer.changedInstr(MI);
3021       return true;
3022     }
3023 
3024     // Don't bother handling an edge case that should probably never be produced.
3025     if (ValSize > WideMemSize)
3026       return false;
3027 
3028     LLT WideTy = widenToNextPowerOf2(ValTy);
3029 
3030     Register WideLoad;
3031     if (!WideTy.isVector()) {
3032       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3033       B.buildTrunc(ValReg, WideLoad).getReg(0);
3034     } else {
3035       // Extract the subvector.
3036 
3037       if (isRegisterType(ValTy)) {
3038         // If this a case where G_EXTRACT is legal, use it.
3039         // (e.g. <3 x s32> -> <4 x s32>)
3040         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3041         B.buildExtract(ValReg, WideLoad, 0);
3042       } else {
3043         // For cases where the widened type isn't a nice register value, do a
3044         // wide load and drop the trailing elements (e.g. <3 x s16> -> <4 x s16>).
3045         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3046         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3047       }
3048     }
3049 
3050     MI.eraseFromParent();
3051     return true;
3052   }
3053 
3054   return false;
3055 }
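// Example of the widening performed above: a <3 x s32> load (96 bits in
// memory) with sufficient alignment becomes a <4 x s32> load followed by a
// G_EXTRACT of the low 96 bits, while a <3 x s16> result is instead rebuilt
// by dropping the trailing element of a <4 x s16> wide load.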
3056 
3057 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3058                                         MachineInstr &MI) const {
3059   MachineIRBuilder &B = Helper.MIRBuilder;
3060   MachineRegisterInfo &MRI = *B.getMRI();
3061   GISelChangeObserver &Observer = Helper.Observer;
3062 
3063   Register DataReg = MI.getOperand(0).getReg();
3064   LLT DataTy = MRI.getType(DataReg);
3065 
3066   if (hasBufferRsrcWorkaround(DataTy)) {
3067     Observer.changingInstr(MI);
3068     castBufferRsrcArgToV4I32(MI, B, 0);
3069     Observer.changedInstr(MI);
3070     return true;
3071   }
3072   return false;
3073 }
3074 
3075 bool AMDGPULegalizerInfo::legalizeFMad(
3076   MachineInstr &MI, MachineRegisterInfo &MRI,
3077   MachineIRBuilder &B) const {
3078   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3079   assert(Ty.isScalar());
3080 
3081   MachineFunction &MF = B.getMF();
3082   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3083 
3084   // TODO: Always legal with future ftz flag.
3085   // FIXME: Is checking only the output denormal mode sufficient?
3086   if (Ty == LLT::float32() &&
3087       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3088     return true;
3089   if (Ty == LLT::float16() &&
3090       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3091     return true;
3092 
3093   MachineIRBuilder HelperBuilder(MI);
3094   GISelObserverWrapper DummyObserver;
3095   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3096   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3097 }
3098 
3099 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3100   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3101   Register DstReg = MI.getOperand(0).getReg();
3102   Register PtrReg = MI.getOperand(1).getReg();
3103   Register CmpVal = MI.getOperand(2).getReg();
3104   Register NewVal = MI.getOperand(3).getReg();
3105 
3106   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3107          "this should not have been custom lowered");
3108 
3109   LLT ValTy = MRI.getType(CmpVal);
3110   LLT VecTy = LLT::fixed_vector(2, ValTy);
3111 
3112   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3113 
3114   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3115     .addDef(DstReg)
3116     .addUse(PtrReg)
3117     .addUse(PackedVal)
3118     .setMemRefs(MI.memoperands());
3119 
3120   MI.eraseFromParent();
3121   return true;
3122 }
3123 
3124 /// Return true if it's known that \p Src can never be an f32 denormal value.
3125 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3126                                        Register Src) {
3127   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3128   switch (DefMI->getOpcode()) {
3129   case TargetOpcode::G_INTRINSIC: {
3130     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3131     case Intrinsic::amdgcn_frexp_mant:
3132       return true;
3133     default:
3134       break;
3135     }
3136 
3137     break;
3138   }
3139   case TargetOpcode::G_FFREXP: {
3140     if (DefMI->getOperand(0).getReg() == Src)
3141       return true;
3142     break;
3143   }
3144   case TargetOpcode::G_FPEXT: {
3145     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3146   }
3147   default:
3148     return false;
3149   }
3150 
3151   return false;
3152 }
3153 
3154 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3155   if (Flags & MachineInstr::FmAfn)
3156     return true;
3157   const auto &Options = MF.getTarget().Options;
3158   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3159 }
3160 
3161 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3162                                    unsigned Flags) {
3163   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3164          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3165              DenormalMode::PreserveSign;
3166 }
3167 
3168 std::pair<Register, Register>
3169 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3170                                        unsigned Flags) const {
3171   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3172     return {};
3173 
3174   const LLT F32 = LLT::scalar(32);
3175   auto SmallestNormal = B.buildFConstant(
3176       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3177   auto IsLtSmallestNormal =
3178       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3179 
3180   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3181   auto One = B.buildFConstant(F32, 1.0);
3182   auto ScaleFactor =
3183       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3184   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3185 
3186   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3187 }
3188 
3189 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3190                                         MachineIRBuilder &B) const {
3191   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3192   // If we have to handle denormals, scale up the input and adjust the result.
3193 
3194   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3195   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3196 
3197   Register Dst = MI.getOperand(0).getReg();
3198   Register Src = MI.getOperand(1).getReg();
3199   LLT Ty = B.getMRI()->getType(Dst);
3200   unsigned Flags = MI.getFlags();
3201 
3202   if (Ty == LLT::scalar(16)) {
3203     const LLT F32 = LLT::scalar(32);
3204     // Nothing in half is a denormal when promoted to f32.
3205     auto Ext = B.buildFPExt(F32, Src, Flags);
3206     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3207                     .addUse(Ext.getReg(0))
3208                     .setMIFlags(Flags);
3209     B.buildFPTrunc(Dst, Log2, Flags);
3210     MI.eraseFromParent();
3211     return true;
3212   }
3213 
3214   assert(Ty == LLT::scalar(32));
3215 
3216   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3217   if (!ScaledInput) {
3218     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3219         .addUse(Src)
3220         .setMIFlags(Flags);
3221     MI.eraseFromParent();
3222     return true;
3223   }
3224 
3225   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3226                   .addUse(ScaledInput)
3227                   .setMIFlags(Flags);
3228 
3229   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3230   auto Zero = B.buildFConstant(Ty, 0.0);
3231   auto ResultOffset =
3232       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3233   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3234 
3235   MI.eraseFromParent();
3236   return true;
3237 }
3238 
3239 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3240                        Register Z, unsigned Flags) {
3241   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3242   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3243 }
3244 
3245 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3246                                              MachineIRBuilder &B) const {
3247   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3248   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3249 
3250   MachineRegisterInfo &MRI = *B.getMRI();
3251   Register Dst = MI.getOperand(0).getReg();
3252   Register X = MI.getOperand(1).getReg();
3253   unsigned Flags = MI.getFlags();
3254   const LLT Ty = MRI.getType(X);
3255   MachineFunction &MF = B.getMF();
3256 
3257   const LLT F32 = LLT::scalar(32);
3258   const LLT F16 = LLT::scalar(16);
3259 
3260   const AMDGPUTargetMachine &TM =
3261       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3262 
3263   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3264       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3265     if (Ty == F16 && !ST.has16BitInsts()) {
3266       Register LogVal = MRI.createGenericVirtualRegister(F32);
3267       auto PromoteSrc = B.buildFPExt(F32, X);
3268       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3269       B.buildFPTrunc(Dst, LogVal);
3270     } else {
3271       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3272     }
3273 
3274     MI.eraseFromParent();
3275     return true;
3276   }
3277 
3278   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3279   if (ScaledInput)
3280     X = ScaledInput;
3281 
3282   auto Y =
3283       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3284 
3285   Register R;
3286   if (ST.hasFastFMAF32()) {
3287     // c + cc is ln(2)/ln(10) to more than 49 bits
3288     const float c_log10 = 0x1.344134p-2f;
3289     const float cc_log10 = 0x1.09f79ep-26f;
3290 
3291     // c + cc is ln(2) to more than 49 bits
3292     const float c_log = 0x1.62e42ep-1f;
3293     const float cc_log = 0x1.efa39ep-25f;
3294 
3295     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3296     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3297 
3298     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3299     auto NegR = B.buildFNeg(Ty, R, Flags);
3300     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3301     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3302     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3303   } else {
3304     // ch+ct is ln(2)/ln(10) to more than 36 bits
3305     const float ch_log10 = 0x1.344000p-2f;
3306     const float ct_log10 = 0x1.3509f6p-18f;
3307 
3308     // ch + ct is ln(2) to more than 36 bits
3309     const float ch_log = 0x1.62e000p-1f;
3310     const float ct_log = 0x1.0bfbe8p-15f;
3311 
3312     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3313     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3314 
3315     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3316     auto YH = B.buildAnd(Ty, Y, MaskConst);
3317     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3318     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3319 
3320     Register Mad0 =
3321         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3322     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3323     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3324   }
3325 
3326   const bool IsFiniteOnly =
3327       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3328       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3329 
3330   if (!IsFiniteOnly) {
3331     // Expand isfinite(x) => fabs(x) < inf
3332     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3333     auto Fabs = B.buildFAbs(Ty, Y);
3334     auto IsFinite =
3335         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3336     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3337   }
3338 
3339   if (ScaledInput) {
3340     auto Zero = B.buildFConstant(Ty, 0.0);
3341     auto ShiftK =
3342         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3343     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3344     B.buildFSub(Dst, R, Shift, Flags);
3345   } else {
3346     B.buildCopy(Dst, R);
3347   }
3348 
3349   MI.eraseFromParent();
3350   return true;
3351 }
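// The extended-precision step above evaluates log(x) ~= log2(x) * C with the
// constant split into a high and a low part; on the fast-FMA path this is
// roughly:
//   r = y * c
//   r = r + (fma(y, c, -r) + y * cc)   // recover the bits lost in y * c
// where y = log2(x) and c + cc approximates ln(2) or ln(2)/ln(10).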
3352 
3353 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3354                                              Register Src, bool IsLog10,
3355                                              unsigned Flags) const {
3356   const double Log2BaseInverted =
3357       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3358 
3359   LLT Ty = B.getMRI()->getType(Dst);
3360 
3361   if (Ty == LLT::scalar(32)) {
3362     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3363     if (ScaledInput) {
3364       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3365                         .addUse(Src)
3366                         .setMIFlags(Flags);
3367       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3368       auto Zero = B.buildFConstant(Ty, 0.0);
3369       auto ResultOffset =
3370           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3371       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3372 
3373       if (ST.hasFastFMAF32())
3374         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3375       else {
3376         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3377         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3378       }
3379 
3380       return true;
3381     }
3382   }
3383 
3384   auto Log2Operand = Ty == LLT::scalar(16)
3385                          ? B.buildFLog2(Ty, Src, Flags)
3386                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3387                                .addUse(Src)
3388                                .setMIFlags(Flags);
3389   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3390   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3391   return true;
3392 }
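// The fast path above is simply log_b(x) = log2(x) * (ln(2) / ln(b)); the
// select only folds a -32 * (ln(2) / ln(b)) correction back in when the f32
// input had to be scaled out of the denormal range first.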
3393 
3394 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3395                                         MachineIRBuilder &B) const {
3396   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3397   // If we have to handle denormals, scale up the input and adjust the result.
3398 
3399   Register Dst = MI.getOperand(0).getReg();
3400   Register Src = MI.getOperand(1).getReg();
3401   unsigned Flags = MI.getFlags();
3402   LLT Ty = B.getMRI()->getType(Dst);
3403   const LLT F16 = LLT::scalar(16);
3404   const LLT F32 = LLT::scalar(32);
3405 
3406   if (Ty == F16) {
3407     // Nothing in half is a denormal when promoted to f32.
3408     auto Ext = B.buildFPExt(F32, Src, Flags);
3409     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3410                     .addUse(Ext.getReg(0))
3411                     .setMIFlags(Flags);
3412     B.buildFPTrunc(Dst, Log2, Flags);
3413     MI.eraseFromParent();
3414     return true;
3415   }
3416 
3417   assert(Ty == F32);
3418 
3419   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3420     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3421         .addUse(Src)
3422         .setMIFlags(Flags);
3423     MI.eraseFromParent();
3424     return true;
3425   }
3426 
3427   // bool needs_scaling = x < -0x1.f80000p+6f;
3428   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3429 
3430   // -nextafter(128.0, -1)
3431   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3432   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3433                                   RangeCheckConst, Flags);
3434 
3435   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3436   auto Zero = B.buildFConstant(Ty, 0.0);
3437   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3438   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3439 
3440   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3441                   .addUse(AddInput.getReg(0))
3442                   .setMIFlags(Flags);
3443 
3444   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3445   auto One = B.buildFConstant(Ty, 1.0);
3446   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3447   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
3452 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3453                                              Register X, unsigned Flags) const {
3454   LLT Ty = B.getMRI()->getType(Dst);
3455   LLT F32 = LLT::scalar(32);
3456 
3457   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3458     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3459     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3460 
3461     if (Ty == F32) {
3462       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3463         .addUse(Mul.getReg(0))
3464         .setMIFlags(Flags);
3465     } else {
3466       B.buildFExp2(Dst, Mul.getReg(0), Flags);
3467     }
3468 
3469     return true;
3470   }
3471 
3472   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3473   auto NeedsScaling =
3474       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3475   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3476   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3477   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3478 
3479   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3480   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3481 
3482   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3483     .addUse(ExpInput.getReg(0))
3484     .setMIFlags(Flags);
3485 
3486   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3487   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3488   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3489   return true;
3490 }
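// Fast path above: exp(x) = exp2(x * log2(e)). When f32 denormal results must
// be preserved, inputs below the threshold are first shifted up by 64 and the
// result is scaled back down; 0x1.969d48p-93f is approximately e^-64.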
3491 
3492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3493                                        MachineIRBuilder &B) const {
3494   Register Dst = MI.getOperand(0).getReg();
3495   Register X = MI.getOperand(1).getReg();
3496   const unsigned Flags = MI.getFlags();
3497   MachineFunction &MF = B.getMF();
3498   MachineRegisterInfo &MRI = *B.getMRI();
3499   LLT Ty = MRI.getType(Dst);
3500   const LLT F16 = LLT::scalar(16);
3501   const LLT F32 = LLT::scalar(32);
3502   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3503 
3504   if (Ty == F16) {
3505     // v_exp_f16 (fmul x, log2e)
3506     if (allowApproxFunc(MF, Flags)) {
3507       // TODO: Does this really require fast?
3508       legalizeFExpUnsafe(B, Dst, X, Flags);
3509       MI.eraseFromParent();
3510       return true;
3511     }
3512 
3513     // exp(f16 x) ->
3514     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3515 
3516     // Nothing in half is a denormal when promoted to f32.
3517     auto Ext = B.buildFPExt(F32, X, Flags);
3518     Register Lowered = MRI.createGenericVirtualRegister(F32);
3519     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3520     B.buildFPTrunc(Dst, Lowered, Flags);
3521     MI.eraseFromParent();
3522     return true;
3523   }
3524 
3525   assert(Ty == F32);
3526 
3527   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3528   // library behavior. Also, is a known-not-DAZ source sufficient?
3529   if (allowApproxFunc(MF, Flags)) {
3530     legalizeFExpUnsafe(B, Dst, X, Flags);
3531     MI.eraseFromParent();
3532     return true;
3533   }
3534 
3535   //    Algorithm:
3536   //
3537   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3538   //
3539   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3540   //    n = 64*m + j,   0 <= j < 64
3541   //
3542   //    e^x = 2^((64*m + j + f)/64)
3543   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3544   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3545   //
3546   //    f = x*(64/ln(2)) - n
3547   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3548   //
3549   //    e^x = (2^m) * (2^(j/64)) * e^r
3550   //
3551   //    (2^(j/64)) is precomputed
3552   //
3553   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3554   //    e^r = 1 + q
3555   //
3556   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3557   //
3558   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3559   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3560   Register PH, PL;
3561 
3562   if (ST.hasFastFMAF32()) {
3563     const float c_exp = numbers::log2ef;
3564     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3565     const float c_exp10 = 0x1.a934f0p+1f;
3566     const float cc_exp10 = 0x1.2f346ep-24f;
3567 
3568     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3569     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3570     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3571     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3572 
3573     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3574     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3575   } else {
3576     const float ch_exp = 0x1.714000p+0f;
3577     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3578 
3579     const float ch_exp10 = 0x1.a92000p+1f;
3580     const float cl_exp10 = 0x1.4f0978p-11f;
3581 
3582     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3583     auto XH = B.buildAnd(Ty, X, MaskConst);
3584     auto XL = B.buildFSub(Ty, X, XH, Flags);
3585 
3586     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3587     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3588 
3589     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3590     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3591 
3592     Register Mad0 =
3593         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3594     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3595   }
3596 
3597   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3598 
3599   // It is unsafe to contract this fsub into the PH multiply.
3600   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3601   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3602   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3603 
3604   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3605                   .addUse(A.getReg(0))
3606                   .setMIFlags(Flags);
3607   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3608 
3609   auto UnderflowCheckConst =
3610       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3611   auto Zero = B.buildFConstant(Ty, 0.0);
3612   auto Underflow =
3613       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3614 
3615   R = B.buildSelect(Ty, Underflow, Zero, R);
3616 
3617   const auto &Options = MF.getTarget().Options;
3618 
3619   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3620     auto OverflowCheckConst =
3621         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3622 
3623     auto Overflow =
3624         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3625     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3626     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3627   }
3628 
3629   B.buildCopy(Dst, R);
3630   MI.eraseFromParent();
3631   return true;
3632 }
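// Putting the pieces above together, the precise f32 path computes
//   e^x (or 10^x) = ldexp(v_exp_f32((PH - E) + PL), (int)E), E = roundeven(PH)
// where PH + PL approximates x * log2(e) (or x * log2(10)) in extended
// precision, with explicit underflow/overflow clamping applied afterwards.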
3633 
3634 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3635                                        MachineIRBuilder &B) const {
3636   Register Dst = MI.getOperand(0).getReg();
3637   Register Src0 = MI.getOperand(1).getReg();
3638   Register Src1 = MI.getOperand(2).getReg();
3639   unsigned Flags = MI.getFlags();
3640   LLT Ty = B.getMRI()->getType(Dst);
3641   const LLT F16 = LLT::float16();
3642   const LLT F32 = LLT::float32();
3643 
3644   if (Ty == F32) {
3645     auto Log = B.buildFLog2(F32, Src0, Flags);
3646     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3647                    .addUse(Log.getReg(0))
3648                    .addUse(Src1)
3649                    .setMIFlags(Flags);
3650     B.buildFExp2(Dst, Mul, Flags);
3651   } else if (Ty == F16) {
3652     // There's no f16 fmul_legacy, so we need to convert for it.
3653     auto Log = B.buildFLog2(F16, Src0, Flags);
3654     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3655     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3656     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3657                    .addUse(Ext0.getReg(0))
3658                    .addUse(Ext1.getReg(0))
3659                    .setMIFlags(Flags);
3660     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3661   } else
3662     return false;
3663 
3664   MI.eraseFromParent();
3665   return true;
3666 }
3667 
3668 // Find a source register, ignoring any possible source modifiers.
3669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3670   Register ModSrc = OrigSrc;
3671   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3672     ModSrc = SrcFNeg->getOperand(1).getReg();
3673     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3674       ModSrc = SrcFAbs->getOperand(1).getReg();
3675   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3676     ModSrc = SrcFAbs->getOperand(1).getReg();
3677   return ModSrc;
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3681                                          MachineRegisterInfo &MRI,
3682                                          MachineIRBuilder &B) const {
3683 
3684   const LLT S1 = LLT::scalar(1);
3685   const LLT F64 = LLT::float64();
3686   Register Dst = MI.getOperand(0).getReg();
3687   Register OrigSrc = MI.getOperand(1).getReg();
3688   unsigned Flags = MI.getFlags();
3689   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3690          "this should not have been custom lowered");
3691 
3692   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3693   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3694   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3695   // V_FRACT bug is:
3696   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3697   //
3698   // Convert floor(x) to (x - fract(x))
3699 
3700   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3701                    .addUse(OrigSrc)
3702                    .setMIFlags(Flags);
3703 
3704   // Give source modifier matching some assistance before obscuring a foldable
3705   // pattern.
3706 
3707   // TODO: Can we avoid the neg on the fract? The input sign to fract
3708   // shouldn't matter.
3709   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3710 
3711   auto Const =
3712       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3713 
3714   Register Min = MRI.createGenericVirtualRegister(F64);
3715 
3716   // We don't need to concern ourselves with the snan handling difference, so
3717   // use the one which will directly select.
3718   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3719   if (MFI->getMode().IEEE)
3720     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3721   else
3722     B.buildFMinNum(Min, Fract, Const, Flags);
3723 
3724   Register CorrectedFract = Min;
3725   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3726     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3727     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3728   }
3729 
3730   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3731   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3732 
3733   MI.eraseFromParent();
3734   return true;
3735 }
3736 
3737 // Turn an illegal packed v2s16 build vector into bit operations.
3738 // TODO: This should probably be a bitcast action in LegalizerHelper.
3739 bool AMDGPULegalizerInfo::legalizeBuildVector(
3740   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3741   Register Dst = MI.getOperand(0).getReg();
3742   const LLT S32 = LLT::scalar(32);
3743   const LLT S16 = LLT::scalar(16);
3744   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3745 
3746   Register Src0 = MI.getOperand(1).getReg();
3747   Register Src1 = MI.getOperand(2).getReg();
3748 
3749   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3750     assert(MRI.getType(Src0) == S32);
3751     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3752     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3753   }
3754 
3755   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3756   B.buildBitcast(Dst, Merge);
3757 
3758   MI.eraseFromParent();
3759   return true;
3760 }
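// e.g. a v2s16 G_BUILD_VECTOR_TRUNC of two s32 sources becomes: truncate both
// to s16, merge the halves into an s32, then bitcast the s32 back to v2s16.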
3761 
3762 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3763 //
3764 // Source and accumulation registers must all be 32-bits.
3765 //
3766 // TODO: When the multiply is uniform, we should produce a code sequence
3767 // that is better suited to instruction selection on the SALU. Instead of
3768 // the outer loop going over parts of the result, the outer loop should go
3769 // over parts of one of the factors. This should result in instruction
3770 // selection that makes full use of S_ADDC_U32 instructions.
3771 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3772                                         MutableArrayRef<Register> Accum,
3773                                         ArrayRef<Register> Src0,
3774                                         ArrayRef<Register> Src1,
3775                                         bool UsePartialMad64_32,
3776                                         bool SeparateOddAlignedProducts) const {
3777   // Use (possibly empty) vectors of S1 registers to represent the set of
3778   // carries from one pair of positions to the next.
3779   using Carry = SmallVector<Register, 2>;
3780 
3781   MachineIRBuilder &B = Helper.MIRBuilder;
3782   GISelKnownBits &KB = *Helper.getKnownBits();
3783 
3784   const LLT S1 = LLT::scalar(1);
3785   const LLT S32 = LLT::scalar(32);
3786   const LLT S64 = LLT::scalar(64);
3787 
3788   Register Zero32;
3789   Register Zero64;
3790 
3791   auto getZero32 = [&]() -> Register {
3792     if (!Zero32)
3793       Zero32 = B.buildConstant(S32, 0).getReg(0);
3794     return Zero32;
3795   };
3796   auto getZero64 = [&]() -> Register {
3797     if (!Zero64)
3798       Zero64 = B.buildConstant(S64, 0).getReg(0);
3799     return Zero64;
3800   };
3801 
3802   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3803   for (unsigned i = 0; i < Src0.size(); ++i) {
3804     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3805     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3806   }
3807 
3808   // Merge the given carries into the 32-bit LocalAccum, which is modified
3809   // in-place.
3810   //
3811   // Returns the carry-out, which is a single S1 register or null.
3812   auto mergeCarry =
3813       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3814         if (CarryIn.empty())
3815           return Register();
3816 
3817         bool HaveCarryOut = true;
3818         Register CarryAccum;
3819         if (CarryIn.size() == 1) {
3820           if (!LocalAccum) {
3821             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3822             return Register();
3823           }
3824 
3825           CarryAccum = getZero32();
3826         } else {
3827           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3828           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3829             CarryAccum =
3830                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3831                     .getReg(0);
3832           }
3833 
3834           if (!LocalAccum) {
3835             LocalAccum = getZero32();
3836             HaveCarryOut = false;
3837           }
3838         }
3839 
3840         auto Add =
3841             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3842         LocalAccum = Add.getReg(0);
3843         return HaveCarryOut ? Add.getReg(1) : Register();
3844       };
3845 
3846   // Build a multiply-add chain to compute
3847   //
3848   //   LocalAccum + (partial products at DstIndex)
3849   //       + (opportunistic subset of CarryIn)
3850   //
3851   // LocalAccum is an array of one or two 32-bit registers that are updated
3852   // in-place. The incoming registers may be null.
3853   //
3854   // In some edge cases, carry-ins can be consumed "for free". In that case,
3855   // the consumed carry bits are removed from CarryIn in-place.
3856   auto buildMadChain =
3857       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3858           -> Carry {
3859         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3860                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3861 
3862         Carry CarryOut;
3863         unsigned j0 = 0;
3864 
3865         // Use plain 32-bit multiplication for the most significant part of the
3866         // result by default.
3867         if (LocalAccum.size() == 1 &&
3868             (!UsePartialMad64_32 || !CarryIn.empty())) {
3869           do {
3870             // Skip multiplication if one of the operands is 0
3871             unsigned j1 = DstIndex - j0;
3872             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3873               ++j0;
3874               continue;
3875             }
3876             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3877             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3878               LocalAccum[0] = Mul.getReg(0);
3879             } else {
3880               if (CarryIn.empty()) {
3881                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3882               } else {
3883                 LocalAccum[0] =
3884                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3885                         .getReg(0);
3886                 CarryIn.pop_back();
3887               }
3888             }
3889             ++j0;
3890           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3891         }
3892 
3893         // Build full 64-bit multiplies.
3894         if (j0 <= DstIndex) {
3895           bool HaveSmallAccum = false;
3896           Register Tmp;
3897 
3898           if (LocalAccum[0]) {
3899             if (LocalAccum.size() == 1) {
3900               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3901               HaveSmallAccum = true;
3902             } else if (LocalAccum[1]) {
3903               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3904               HaveSmallAccum = false;
3905             } else {
3906               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3907               HaveSmallAccum = true;
3908             }
3909           } else {
3910             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3911             Tmp = getZero64();
3912             HaveSmallAccum = true;
3913           }
3914 
3915           do {
3916             unsigned j1 = DstIndex - j0;
3917             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3918               ++j0;
3919               continue;
3920             }
3921             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3922                                     {Src0[j0], Src1[j1], Tmp});
3923             Tmp = Mad.getReg(0);
3924             if (!HaveSmallAccum)
3925               CarryOut.push_back(Mad.getReg(1));
3926             HaveSmallAccum = false;
3927 
3928             ++j0;
3929           } while (j0 <= DstIndex);
3930 
3931           auto Unmerge = B.buildUnmerge(S32, Tmp);
3932           LocalAccum[0] = Unmerge.getReg(0);
3933           if (LocalAccum.size() > 1)
3934             LocalAccum[1] = Unmerge.getReg(1);
3935         }
3936 
3937         return CarryOut;
3938       };
3939 
3940   // Outer multiply loop, iterating over destination parts from least
3941   // significant to most significant parts.
3942   //
3943   // The columns of the following diagram correspond to the destination parts
3944   // affected by one iteration of the outer loop (ignoring boundary
3945   // conditions).
3946   //
3947   //   Dest index relative to 2 * i:      1 0 -1
3948   //                                      ------
3949   //   Carries from previous iteration:     e o
3950   //   Even-aligned partial product sum:  E E .
3951   //   Odd-aligned partial product sum:     O O
3952   //
3953   // 'o' is OddCarry, 'e' is EvenCarry.
3954   // EE and OO are computed from partial products via buildMadChain and use
3955   // accumulation where possible and appropriate.
3956   //
3957   Register SeparateOddCarry;
3958   Carry EvenCarry;
3959   Carry OddCarry;
3960 
3961   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3962     Carry OddCarryIn = std::move(OddCarry);
3963     Carry EvenCarryIn = std::move(EvenCarry);
3964     OddCarry.clear();
3965     EvenCarry.clear();
3966 
3967     // Partial products at offset 2 * i.
3968     if (2 * i < Accum.size()) {
3969       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3970       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3971     }
3972 
3973     // Partial products at offset 2 * i - 1.
3974     if (i > 0) {
3975       if (!SeparateOddAlignedProducts) {
3976         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3977         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3978       } else {
3979         bool IsHighest = 2 * i >= Accum.size();
3980         Register SeparateOddOut[2];
3981         auto LocalAccum = MutableArrayRef(SeparateOddOut)
3982                               .take_front(IsHighest ? 1 : 2);
3983         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3984 
3985         MachineInstr *Lo;
3986 
3987         if (i == 1) {
3988           if (!IsHighest)
3989             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3990           else
3991             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3992         } else {
3993           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3994                             SeparateOddCarry);
3995         }
3996         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3997 
3998         if (!IsHighest) {
3999           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4000                                 Lo->getOperand(1).getReg());
4001           Accum[2 * i] = Hi.getReg(0);
4002           SeparateOddCarry = Hi.getReg(1);
4003         }
4004       }
4005     }
4006 
4007     // Add in the carries from the previous iteration
4008     if (i > 0) {
4009       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4010         EvenCarryIn.push_back(CarryOut);
4011 
4012       if (2 * i < Accum.size()) {
4013         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4014           OddCarry.push_back(CarryOut);
4015       }
4016     }
4017   }
4018 }
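// For reference, with two 32-bit parts per operand (a 64 x 64 -> 64 multiply,
// operands (a1:a0) and (b1:b0)) the chains above roughly compute
//   Accum[0] = lo32(a0 * b0)
//   Accum[1] = hi32(a0 * b0) + lo32(a0 * b1) + lo32(a1 * b0)   (mod 2^32)
// with intermediate carries tracked in the S1 carry vectors.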
4019 
4020 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4021 //
4022 // TODO: If the multiply is followed by an addition, we should attempt to
4023 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4024 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4025                                       MachineInstr &MI) const {
4026   assert(ST.hasMad64_32());
4027   assert(MI.getOpcode() == TargetOpcode::G_MUL);
4028 
4029   MachineIRBuilder &B = Helper.MIRBuilder;
4030   MachineRegisterInfo &MRI = *B.getMRI();
4031 
4032   Register DstReg = MI.getOperand(0).getReg();
4033   Register Src0 = MI.getOperand(1).getReg();
4034   Register Src1 = MI.getOperand(2).getReg();
4035 
4036   LLT Ty = MRI.getType(DstReg);
4037   assert(Ty.isScalar());
4038 
4039   unsigned Size = Ty.getSizeInBits();
4040   unsigned NumParts = Size / 32;
4041   assert((Size % 32) == 0);
4042   assert(NumParts >= 2);
4043 
4044   // Whether to use MAD_64_32 for partial products whose high half is
4045   // discarded. This avoids some ADD instructions but risks false dependency
4046   // stalls on some subtargets in some cases.
4047   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4048 
4049   // Whether to compute odd-aligned partial products separately. This is
4050   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4051   // in an even-aligned VGPR.
4052   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4053 
4054   LLT S32 = LLT::scalar(32);
4055   SmallVector<Register, 2> Src0Parts, Src1Parts;
4056   for (unsigned i = 0; i < NumParts; ++i) {
4057     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4058     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4059   }
4060   B.buildUnmerge(Src0Parts, Src0);
4061   B.buildUnmerge(Src1Parts, Src1);
4062 
4063   SmallVector<Register, 2> AccumRegs(NumParts);
4064   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4065                 SeparateOddAlignedProducts);
4066 
4067   B.buildMergeLikeInstr(DstReg, AccumRegs);
4068   MI.eraseFromParent();
4069   return true;
4070 }
4071 
4072 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4073 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4074 // case with a single min instruction instead of a compare+select.
4075 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4076                                             MachineRegisterInfo &MRI,
4077                                             MachineIRBuilder &B) const {
4078   Register Dst = MI.getOperand(0).getReg();
4079   Register Src = MI.getOperand(1).getReg();
4080   LLT DstTy = MRI.getType(Dst);
4081   LLT SrcTy = MRI.getType(Src);
4082 
4083   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4084                         ? AMDGPU::G_AMDGPU_FFBH_U32
4085                         : AMDGPU::G_AMDGPU_FFBL_B32;
4086   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4087   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4088 
4089   MI.eraseFromParent();
4090   return true;
4091 }
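// i.e. ctlz(x) -> umin(ffbh_u32(x), bitwidth) and
//      cttz(x) -> umin(ffbl_b32(x), bitwidth);
// ffbh/ffbl yield an all-ones result for a zero input, so the single umin
// clamps that case to the defined result.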
4092 
4093 // Check that this is a G_XOR x, -1
4094 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4095   if (MI.getOpcode() != TargetOpcode::G_XOR)
4096     return false;
4097   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4098   return ConstVal && *ConstVal == -1;
4099 }
4100 
4101 // Return the use branch instruction, otherwise null if the usage is invalid.
4102 static MachineInstr *
4103 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4104                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4105   Register CondDef = MI.getOperand(0).getReg();
4106   if (!MRI.hasOneNonDBGUse(CondDef))
4107     return nullptr;
4108 
4109   MachineBasicBlock *Parent = MI.getParent();
4110   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4111 
4112   if (isNot(MRI, *UseMI)) {
4113     Register NegatedCond = UseMI->getOperand(0).getReg();
4114     if (!MRI.hasOneNonDBGUse(NegatedCond))
4115       return nullptr;
4116 
4117     // We're deleting the def of this value, so we need to remove it.
4118     eraseInstr(*UseMI, MRI);
4119 
4120     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4121     Negated = true;
4122   }
4123 
4124   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4125     return nullptr;
4126 
4127   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4128   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4129   if (Next == Parent->end()) {
4130     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4131     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4132       return nullptr;
4133     UncondBrTarget = &*NextMBB;
4134   } else {
4135     if (Next->getOpcode() != AMDGPU::G_BR)
4136       return nullptr;
4137     Br = &*Next;
4138     UncondBrTarget = Br->getOperand(0).getMBB();
4139   }
4140 
4141   return UseMI;
4142 }
4143 
4144 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4145                                          const ArgDescriptor *Arg,
4146                                          const TargetRegisterClass *ArgRC,
4147                                          LLT ArgTy) const {
4148   MCRegister SrcReg = Arg->getRegister();
4149   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4150   assert(DstReg.isVirtual() && "Virtual register expected");
4151 
4152   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4153                                              *ArgRC, B.getDebugLoc(), ArgTy);
4154   if (Arg->isMasked()) {
4155     // TODO: Should we try to emit this once in the entry block?
4156     const LLT S32 = LLT::scalar(32);
4157     const unsigned Mask = Arg->getMask();
4158     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4159 
4160     Register AndMaskSrc = LiveIn;
4161 
4162     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4163     // 0.
4164     if (Shift != 0) {
4165       auto ShiftAmt = B.buildConstant(S32, Shift);
4166       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4167     }
4168 
4169     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4170   } else {
4171     B.buildCopy(DstReg, LiveIn);
4172   }
4173 
4174   return true;
4175 }
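// e.g. for an ID packed into the high half of an SGPR (Mask = 0xFFFF0000),
// Shift is 16 and the value is recovered as (LiveIn >> 16) & 0xFFFF.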
4176 
4177 bool AMDGPULegalizerInfo::loadInputValue(
4178     Register DstReg, MachineIRBuilder &B,
4179     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4180   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4181   const ArgDescriptor *Arg = nullptr;
4182   const TargetRegisterClass *ArgRC;
4183   LLT ArgTy;
4184 
4185   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4186   const ArgDescriptor WorkGroupIDX =
4187       ArgDescriptor::createRegister(AMDGPU::TTMP9);
4188   // If GridZ is not programmed in an entry function then the hardware will set
4189   // it to all zeros, so there is no need to mask the GridY value in the low
4190   // order bits.
4191   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4192       AMDGPU::TTMP7,
4193       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4194   const ArgDescriptor WorkGroupIDZ =
4195       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4196   if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
4197     switch (ArgType) {
4198     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4199       Arg = &WorkGroupIDX;
4200       ArgRC = &AMDGPU::SReg_32RegClass;
4201       ArgTy = LLT::scalar(32);
4202       break;
4203     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4204       Arg = &WorkGroupIDY;
4205       ArgRC = &AMDGPU::SReg_32RegClass;
4206       ArgTy = LLT::scalar(32);
4207       break;
4208     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4209       Arg = &WorkGroupIDZ;
4210       ArgRC = &AMDGPU::SReg_32RegClass;
4211       ArgTy = LLT::scalar(32);
4212       break;
4213     default:
4214       break;
4215     }
4216   }
4217 
4218   if (!Arg)
4219     std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4220 
4221   if (!Arg) {
4222     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4223       // The intrinsic may appear when we have a zero-sized kernarg segment, in
4224       // which case the pointer argument may be missing and we use null.
4225       B.buildConstant(DstReg, 0);
4226       return true;
4227     }
4228 
4229     // It's undefined behavior if a function marked with the amdgpu-no-*
4230     // attributes uses the corresponding intrinsic.
4231     B.buildUndef(DstReg);
4232     return true;
4233   }
4234 
4235   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4236     return false; // TODO: Handle these
4237   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4238 }
4239 
4240 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4241     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4242     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4243   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4244     return false;
4245 
4246   MI.eraseFromParent();
4247   return true;
4248 }
4249 
4250 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4251                                 int64_t C) {
4252   B.buildConstant(MI.getOperand(0).getReg(), C);
4253   MI.eraseFromParent();
4254   return true;
4255 }
4256 
4257 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4258     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4259     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4260   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4261   if (MaxID == 0)
4262     return replaceWithConstant(B, MI, 0);
4263 
4264   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4265   const ArgDescriptor *Arg;
4266   const TargetRegisterClass *ArgRC;
4267   LLT ArgTy;
4268   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4269 
4270   Register DstReg = MI.getOperand(0).getReg();
4271   if (!Arg) {
4272     // It's undefined behavior if a function marked with the amdgpu-no-*
4273     // attributes uses the corresponding intrinsic.
4274     B.buildUndef(DstReg);
4275     MI.eraseFromParent();
4276     return true;
4277   }
4278 
4279   if (Arg->isMasked()) {
4280     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4281     // masking operations anyway.
4282     //
4283     // TODO: We could assert the top bit is 0 for the source copy.
4284     if (!loadInputValue(DstReg, B, ArgType))
4285       return false;
4286   } else {
4287     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4288     if (!loadInputValue(TmpReg, B, ArgType))
4289       return false;
4290     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4291   }
4292 
4293   MI.eraseFromParent();
4294   return true;
4295 }
4296 
4297 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4298                                                      int64_t Offset) const {
4299   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4300   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4301 
4302   // TODO: If we passed in the base kernel offset we could have a better
4303   // alignment than 4, but we don't really need it.
4304   if (!loadInputValue(KernArgReg, B,
4305                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4306     llvm_unreachable("failed to find kernarg segment ptr");
4307 
4308   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4309   // TODO: Should get nuw
4310   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4311 }
4312 
4313 /// Legalize a value that's loaded from kernel arguments. This is only used by
4314 /// legacy intrinsics.
4315 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4316                                                       MachineIRBuilder &B,
4317                                                       uint64_t Offset,
4318                                                       Align Alignment) const {
4319   Register DstReg = MI.getOperand(0).getReg();
4320 
4321   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4322          "unexpected kernarg parameter type");
4323 
4324   Register Ptr = getKernargParameterPtr(B, Offset);
4325   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4326   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4327               MachineMemOperand::MODereferenceable |
4328                   MachineMemOperand::MOInvariant);
4329   MI.eraseFromParent();
4330   return true;
4331 }
4332 
4333 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4334                                        MachineRegisterInfo &MRI,
4335                                        MachineIRBuilder &B) const {
4336   Register Dst = MI.getOperand(0).getReg();
4337   LLT DstTy = MRI.getType(Dst);
4338   LLT S16 = LLT::scalar(16);
4339   LLT S32 = LLT::scalar(32);
4340   LLT S64 = LLT::scalar(64);
4341 
4342   if (DstTy == S16)
4343     return legalizeFDIV16(MI, MRI, B);
4344   if (DstTy == S32)
4345     return legalizeFDIV32(MI, MRI, B);
4346   if (DstTy == S64)
4347     return legalizeFDIV64(MI, MRI, B);
4348 
4349   return false;
4350 }
4351 
4352 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4353                                                         Register DstDivReg,
4354                                                         Register DstRemReg,
4355                                                         Register X,
4356                                                         Register Y) const {
4357   const LLT S1 = LLT::scalar(1);
4358   const LLT S32 = LLT::scalar(32);
4359 
4360   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4361   // algorithm used here.
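  // In short: approximate the reciprocal of Y with a scaled rcp, sharpen it
  // with one Newton-Raphson step, form quotient/remainder estimates, and then
  // apply at most two conditional corrections below.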
4362 
4363   // Initial estimate of inv(y).
4364   auto FloatY = B.buildUITOFP(S32, Y);
4365   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4366   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4367   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4368   auto Z = B.buildFPTOUI(S32, ScaledY);
4369 
4370   // One round of UNR.
4371   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4372   auto NegYZ = B.buildMul(S32, NegY, Z);
4373   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4374 
4375   // Quotient/remainder estimate.
4376   auto Q = B.buildUMulH(S32, X, Z);
4377   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4378 
4379   // First quotient/remainder refinement.
4380   auto One = B.buildConstant(S32, 1);
4381   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4382   if (DstDivReg)
4383     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4384   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4385 
4386   // Second quotient/remainder refinement.
4387   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4388   if (DstDivReg)
4389     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4390 
4391   if (DstRemReg)
4392     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4393 }
4394 
4395 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4396 //
4397 // Return lo, hi of result
4398 //
4399 // %cvt.lo = G_UITOFP Val.lo
4400 // %cvt.hi = G_UITOFP Val.hi
4401 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4402 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4403 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4404 // %mul2 = G_FMUL %mul1, 2**(-32)
4405 // %trunc = G_INTRINSIC_TRUNC %mul2
4406 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4407 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
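//
// The float constants used below are bit patterns: 0x4f800000 is 2**32,
// 0x2f800000 is 2**(-32), 0xcf800000 is -(2**32), and 0x5f7ffffc is just
// under 2**64, scaling the reciprocal estimate into 64-bit fixed point.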
4408 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4409                                                        Register Val) {
4410   const LLT S32 = LLT::scalar(32);
4411   auto Unmerge = B.buildUnmerge(S32, Val);
4412 
4413   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4414   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4415 
4416   auto Mad = B.buildFMAD(
4417       S32, CvtHi, // 2**32
4418       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4419 
4420   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4421   auto Mul1 = B.buildFMul(
4422       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4423 
4424   // 2**(-32)
4425   auto Mul2 = B.buildFMul(
4426       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4427   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4428 
4429   // -(2**32)
4430   auto Mad2 = B.buildFMAD(
4431       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4432       Mul1);
4433 
4434   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4435   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4436 
4437   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4438 }
4439 
4440 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4441                                                         Register DstDivReg,
4442                                                         Register DstRemReg,
4443                                                         Register Numer,
4444                                                         Register Denom) const {
4445   const LLT S32 = LLT::scalar(32);
4446   const LLT S64 = LLT::scalar(64);
4447   const LLT S1 = LLT::scalar(1);
4448   Register RcpLo, RcpHi;
4449 
4450   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4451 
4452   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4453 
4454   auto Zero64 = B.buildConstant(S64, 0);
4455   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4456 
4457   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4458   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4459 
4460   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4461   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4462   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4463 
4464   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4465   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4466   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4467 
4468   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4469   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4470   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4471   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4472   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4473 
4474   auto Zero32 = B.buildConstant(S32, 0);
4475   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4476   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4477   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4478 
4479   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4480   Register NumerLo = UnmergeNumer.getReg(0);
4481   Register NumerHi = UnmergeNumer.getReg(1);
4482 
4483   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4484   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4485   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4486   Register Mul3_Lo = UnmergeMul3.getReg(0);
4487   Register Mul3_Hi = UnmergeMul3.getReg(1);
4488   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4489   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4490   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4491   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4492 
4493   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4494   Register DenomLo = UnmergeDenom.getReg(0);
4495   Register DenomHi = UnmergeDenom.getReg(1);
4496 
4497   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4498   auto C1 = B.buildSExt(S32, CmpHi);
4499 
4500   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4501   auto C2 = B.buildSExt(S32, CmpLo);
4502 
4503   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4504   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4505 
4506   // TODO: Here and below, portions of the code can be enclosed in if/endif.
4507   // Currently control flow is unconditional and we have 4 selects after the
4508   // potential endif to substitute PHIs.
4509 
4510   // if C3 != 0 ...
4511   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4512   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4513   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4514   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4515 
4516   auto One64 = B.buildConstant(S64, 1);
4517   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4518 
4519   auto C4 =
4520       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4521   auto C5 =
4522       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4523   auto C6 = B.buildSelect(
4524       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4525 
4526   // if (C6 != 0)
4527   auto Add4 = B.buildAdd(S64, Add3, One64);
4528   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4529 
4530   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4531   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4532   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4533 
4534   // endif C6
4535   // endif C3
4536 
4537   if (DstDivReg) {
4538     auto Sel1 = B.buildSelect(
4539         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4540     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4541                   Sel1, MulHi3);
4542   }
4543 
4544   if (DstRemReg) {
4545     auto Sel2 = B.buildSelect(
4546         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4547     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4548                   Sel2, Sub1);
4549   }
4550 }
4551 
4552 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4553                                                   MachineRegisterInfo &MRI,
4554                                                   MachineIRBuilder &B) const {
4555   Register DstDivReg, DstRemReg;
4556   switch (MI.getOpcode()) {
4557   default:
4558     llvm_unreachable("Unexpected opcode!");
4559   case AMDGPU::G_UDIV: {
4560     DstDivReg = MI.getOperand(0).getReg();
4561     break;
4562   }
4563   case AMDGPU::G_UREM: {
4564     DstRemReg = MI.getOperand(0).getReg();
4565     break;
4566   }
4567   case AMDGPU::G_UDIVREM: {
4568     DstDivReg = MI.getOperand(0).getReg();
4569     DstRemReg = MI.getOperand(1).getReg();
4570     break;
4571   }
4572   }
4573 
4574   const LLT S64 = LLT::scalar(64);
4575   const LLT S32 = LLT::scalar(32);
4576   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4577   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4578   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4579   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4580 
4581   if (Ty == S32)
4582     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4583   else if (Ty == S64)
4584     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4585   else
4586     return false;
4587 
4588   MI.eraseFromParent();
4589   return true;
4590 }
4591 
4592 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4593                                                 MachineRegisterInfo &MRI,
4594                                                 MachineIRBuilder &B) const {
4595   const LLT S64 = LLT::scalar(64);
4596   const LLT S32 = LLT::scalar(32);
4597 
4598   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4599   if (Ty != S32 && Ty != S64)
4600     return false;
4601 
4602   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4603   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4604   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4605 
4606   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4607   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4608   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4609 
4610   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4611   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4612 
4613   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4614   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4615 
4616   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4617   switch (MI.getOpcode()) {
4618   default:
4619     llvm_unreachable("Unexpected opcode!");
4620   case AMDGPU::G_SDIV: {
4621     DstDivReg = MI.getOperand(0).getReg();
4622     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4623     break;
4624   }
4625   case AMDGPU::G_SREM: {
4626     DstRemReg = MI.getOperand(0).getReg();
4627     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4628     break;
4629   }
4630   case AMDGPU::G_SDIVREM: {
4631     DstDivReg = MI.getOperand(0).getReg();
4632     DstRemReg = MI.getOperand(1).getReg();
4633     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4634     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4635     break;
4636   }
4637   }
4638 
4639   if (Ty == S32)
4640     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4641   else
4642     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4643 
4644   if (DstDivReg) {
4645     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4646     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4647     B.buildSub(DstDivReg, SignXor, Sign);
4648   }
4649 
4650   if (DstRemReg) {
4651     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4652     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4653     B.buildSub(DstRemReg, SignXor, Sign);
4654   }
4655 
4656   MI.eraseFromParent();
4657   return true;
4658 }
4659 
4660 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4661                                                  MachineRegisterInfo &MRI,
4662                                                  MachineIRBuilder &B) const {
4663   Register Res = MI.getOperand(0).getReg();
4664   Register LHS = MI.getOperand(1).getReg();
4665   Register RHS = MI.getOperand(2).getReg();
4666   uint16_t Flags = MI.getFlags();
4667   LLT ResTy = MRI.getType(Res);
4668 
4669   const MachineFunction &MF = B.getMF();
4670   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4671                             MF.getTarget().Options.UnsafeFPMath;
4672 
4673   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4674     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4675       return false;
4676 
4677     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4678     // the CI documentation have a worst case error of 1 ulp.
4679     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4680     // use it as long as we aren't trying to use denormals.
4681     //
4682     // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
4683 
4684     // 1 / x -> RCP(x)
4685     if (CLHS->isExactlyValue(1.0)) {
4686       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4687           .addUse(RHS)
4688           .setMIFlags(Flags);
4689 
4690       MI.eraseFromParent();
4691       return true;
4692     }
4693 
4694     // -1 / x -> RCP( FNEG(x) )
4695     if (CLHS->isExactlyValue(-1.0)) {
4696       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4697       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4698           .addUse(FNeg.getReg(0))
4699           .setMIFlags(Flags);
4700 
4701       MI.eraseFromParent();
4702       return true;
4703     }
4704   }
4705 
4706   // For f16 require afn or arcp.
4707   // For f32 require afn.
4708   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4709                               !MI.getFlag(MachineInstr::FmArcp)))
4710     return false;
4711 
4712   // x / y -> x * (1.0 / y)
4713   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4714                  .addUse(RHS)
4715                  .setMIFlags(Flags);
4716   B.buildFMul(Res, LHS, RCP, Flags);
4717 
4718   MI.eraseFromParent();
4719   return true;
4720 }
4721 
4722 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4723                                                    MachineRegisterInfo &MRI,
4724                                                    MachineIRBuilder &B) const {
4725   Register Res = MI.getOperand(0).getReg();
4726   Register X = MI.getOperand(1).getReg();
4727   Register Y = MI.getOperand(2).getReg();
4728   uint16_t Flags = MI.getFlags();
4729   LLT ResTy = MRI.getType(Res);
4730 
4731   const MachineFunction &MF = B.getMF();
4732   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4733                             MI.getFlag(MachineInstr::FmAfn);
4734 
4735   if (!AllowInaccurateRcp)
4736     return false;
4737 
4738   auto NegY = B.buildFNeg(ResTy, Y);
4739   auto One = B.buildFConstant(ResTy, 1.0);
4740 
4741   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4742                .addUse(Y)
4743                .setMIFlags(Flags);
4744 
4745   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4746   R = B.buildFMA(ResTy, Tmp0, R, R);
4747 
4748   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4749   R = B.buildFMA(ResTy, Tmp1, R, R);
4750 
4751   auto Ret = B.buildFMul(ResTy, X, R);
4752   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4753 
4754   B.buildFMA(Res, Tmp2, R, Ret);
4755   MI.eraseFromParent();
4756   return true;
4757 }
4758 
4759 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4760                                          MachineRegisterInfo &MRI,
4761                                          MachineIRBuilder &B) const {
4762   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4763     return true;
4764 
4765   Register Res = MI.getOperand(0).getReg();
4766   Register LHS = MI.getOperand(1).getReg();
4767   Register RHS = MI.getOperand(2).getReg();
4768 
4769   uint16_t Flags = MI.getFlags();
4770 
4771   LLT S16 = LLT::scalar(16);
4772   LLT S32 = LLT::scalar(32);
4773 
4774   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4775   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4776 
4777   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4778                  .addUse(RHSExt.getReg(0))
4779                  .setMIFlags(Flags);
4780 
4781   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4782   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4783 
4784   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4785       .addUse(RDst.getReg(0))
4786       .addUse(RHS)
4787       .addUse(LHS)
4788       .setMIFlags(Flags);
4789 
4790   MI.eraseFromParent();
4791   return true;
4792 }
4793 
4794 static const unsigned SPDenormModeBitField =
4795     AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4796     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
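// This hwreg operand selects a 2-bit field at offset 4 of the MODE register,
// which holds the single-precision denormal controls.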
4797 
4798 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4799 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4800 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4801                                const GCNSubtarget &ST,
4802                                SIModeRegisterDefaults Mode) {
4803   // Set SP denorm mode to this value.
4804   unsigned SPDenormMode =
4805     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4806 
4807   if (ST.hasDenormModeInst()) {
4808     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4809     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4810 
4811     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4812     B.buildInstr(AMDGPU::S_DENORM_MODE)
4813       .addImm(NewDenormModeValue);
4814 
4815   } else {
4816     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4817       .addImm(SPDenormMode)
4818       .addImm(SPDenormModeBitField);
4819   }
4820 }
4821 
4822 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4823                                          MachineRegisterInfo &MRI,
4824                                          MachineIRBuilder &B) const {
4825   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4826     return true;
4827 
4828   Register Res = MI.getOperand(0).getReg();
4829   Register LHS = MI.getOperand(1).getReg();
4830   Register RHS = MI.getOperand(2).getReg();
4831   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4832   SIModeRegisterDefaults Mode = MFI->getMode();
4833 
4834   uint16_t Flags = MI.getFlags();
4835 
4836   LLT S32 = LLT::scalar(32);
4837   LLT S1 = LLT::scalar(1);
4838 
4839   auto One = B.buildFConstant(S32, 1.0f);
4840 
4841   auto DenominatorScaled =
4842       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4843           .addUse(LHS)
4844           .addUse(RHS)
4845           .addImm(0)
4846           .setMIFlags(Flags);
4847   auto NumeratorScaled =
4848       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4849           .addUse(LHS)
4850           .addUse(RHS)
4851           .addImm(1)
4852           .setMIFlags(Flags);
4853 
4854   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4855                        .addUse(DenominatorScaled.getReg(0))
4856                        .setMIFlags(Flags);
4857   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4858 
4859   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4860   const bool HasDynamicDenormals =
4861       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4862       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4863 
4864   Register SavedSPDenormMode;
4865   if (!PreservesDenormals) {
4866     if (HasDynamicDenormals) {
4867       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4868       B.buildInstr(AMDGPU::S_GETREG_B32)
4869           .addDef(SavedSPDenormMode)
4870           .addImm(SPDenormModeBitField);
4871     }
4872     toggleSPDenormMode(true, B, ST, Mode);
4873   }
4874 
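  // Standard Newton-Raphson refinement: Fma0/Fma1 refine the reciprocal,
  // Mul is the quotient estimate, Fma2 and Fma4 are residuals, Fma3 refines
  // the quotient, and div_fmas/div_fixup combine them into the result.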
4875   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4876   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4877   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4878   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4879   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4880   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4881 
4882   if (!PreservesDenormals) {
4883     if (HasDynamicDenormals) {
4884       assert(SavedSPDenormMode);
4885       B.buildInstr(AMDGPU::S_SETREG_B32)
4886           .addReg(SavedSPDenormMode)
4887           .addImm(SPDenormModeBitField);
4888     } else
4889       toggleSPDenormMode(false, B, ST, Mode);
4890   }
4891 
4892   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4893                   .addUse(Fma4.getReg(0))
4894                   .addUse(Fma1.getReg(0))
4895                   .addUse(Fma3.getReg(0))
4896                   .addUse(NumeratorScaled.getReg(1))
4897                   .setMIFlags(Flags);
4898 
4899   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4900       .addUse(Fmas.getReg(0))
4901       .addUse(RHS)
4902       .addUse(LHS)
4903       .setMIFlags(Flags);
4904 
4905   MI.eraseFromParent();
4906   return true;
4907 }
4908 
4909 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4910                                          MachineRegisterInfo &MRI,
4911                                          MachineIRBuilder &B) const {
4912   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4913     return true;
4914 
4915   Register Res = MI.getOperand(0).getReg();
4916   Register LHS = MI.getOperand(1).getReg();
4917   Register RHS = MI.getOperand(2).getReg();
4918 
4919   uint16_t Flags = MI.getFlags();
4920 
4921   LLT S64 = LLT::scalar(64);
4922   LLT S1 = LLT::scalar(1);
4923 
4924   auto One = B.buildFConstant(S64, 1.0);
4925 
4926   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4927                        .addUse(LHS)
4928                        .addUse(RHS)
4929                        .addImm(0)
4930                        .setMIFlags(Flags);
4931 
4932   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4933 
4934   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4935                  .addUse(DivScale0.getReg(0))
4936                  .setMIFlags(Flags);
4937 
4938   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4939   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4940   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4941 
4942   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4943                        .addUse(LHS)
4944                        .addUse(RHS)
4945                        .addImm(1)
4946                        .setMIFlags(Flags);
4947 
4948   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4949   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4950   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4951 
4952   Register Scale;
4953   if (!ST.hasUsableDivScaleConditionOutput()) {
4954     // Workaround a hardware bug on SI where the condition output from div_scale
4955     // is not usable.
4956 
4957     LLT S32 = LLT::scalar(32);
4958 
4959     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4960     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4961     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4962     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4963 
4964     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4965                               Scale1Unmerge.getReg(1));
4966     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4967                               Scale0Unmerge.getReg(1));
4968     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4969   } else {
4970     Scale = DivScale1.getReg(1);
4971   }
4972 
4973   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
4974                   .addUse(Fma4.getReg(0))
4975                   .addUse(Fma3.getReg(0))
4976                   .addUse(Mul.getReg(0))
4977                   .addUse(Scale)
4978                   .setMIFlags(Flags);
4979 
4980   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
4981       .addUse(Fmas.getReg(0))
4982       .addUse(RHS)
4983       .addUse(LHS)
4984       .setMIFlags(Flags);
4985 
4986   MI.eraseFromParent();
4987   return true;
4988 }
4989 
4990 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
4991                                          MachineRegisterInfo &MRI,
4992                                          MachineIRBuilder &B) const {
4993   Register Res0 = MI.getOperand(0).getReg();
4994   Register Res1 = MI.getOperand(1).getReg();
4995   Register Val = MI.getOperand(2).getReg();
4996   uint16_t Flags = MI.getFlags();
4997 
4998   LLT Ty = MRI.getType(Res0);
4999   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5000 
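  // frexp_mant yields a mantissa with magnitude in [0.5, 1.0) and frexp_exp
  // the matching exponent, e.g. 8.0 decomposes as 0.5 * 2^4.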
5001   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5002                   .addUse(Val)
5003                   .setMIFlags(Flags);
5004   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5005                  .addUse(Val)
5006                  .setMIFlags(Flags);
5007 
5008   if (ST.hasFractBug()) {
5009     auto Fabs = B.buildFAbs(Ty, Val);
5010     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5011     auto IsFinite =
5012         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5013     auto Zero = B.buildConstant(InstrExpTy, 0);
5014     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5015     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5016   }
5017 
5018   B.buildCopy(Res0, Mant);
5019   B.buildSExtOrTrunc(Res1, Exp);
5020 
5021   MI.eraseFromParent();
5022   return true;
5023 }
5024 
5025 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5026                                                  MachineRegisterInfo &MRI,
5027                                                  MachineIRBuilder &B) const {
5028   Register Res = MI.getOperand(0).getReg();
5029   Register LHS = MI.getOperand(2).getReg();
5030   Register RHS = MI.getOperand(3).getReg();
5031   uint16_t Flags = MI.getFlags();
5032 
5033   LLT S32 = LLT::scalar(32);
5034   LLT S1 = LLT::scalar(1);
5035 
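  // If |RHS| is very large (> 2^96), pre-scale it by 2^-32 so its rcp does
  // not underflow, then fold the same scale back into the result:
  // 1/rhs == scale * (1 / (rhs * scale)).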
5036   auto Abs = B.buildFAbs(S32, RHS, Flags);
5037   const APFloat C0Val(1.0f);
5038 
5039   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5040   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5041   auto C2 = B.buildFConstant(S32, 1.0f);
5042 
5043   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5044   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5045 
5046   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5047 
5048   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5049                  .addUse(Mul0.getReg(0))
5050                  .setMIFlags(Flags);
5051 
5052   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5053 
5054   B.buildFMul(Res, Sel, Mul1, Flags);
5055 
5056   MI.eraseFromParent();
5057   return true;
5058 }
5059 
5060 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5061                                            MachineRegisterInfo &MRI,
5062                                            MachineIRBuilder &B) const {
5063   // Bypass the correct expansion that a standard promotion through G_FSQRT
5064   // would get. The f32 op is accurate enough for the f16 case.
5065   unsigned Flags = MI.getFlags();
5066   assert(!ST.has16BitInsts());
5067   const LLT F32 = LLT::scalar(32);
5068   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5069   auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5070     .addUse(Ext.getReg(0))
5071     .setMIFlags(Flags);
5072   B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5073   MI.eraseFromParent();
5074   return true;
5075 }
5076 
5077 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5078                                            MachineRegisterInfo &MRI,
5079                                            MachineIRBuilder &B) const {
5080   MachineFunction &MF = B.getMF();
5081   Register Dst = MI.getOperand(0).getReg();
5082   Register X = MI.getOperand(1).getReg();
5083   const unsigned Flags = MI.getFlags();
5084   const LLT S1 = LLT::scalar(1);
5085   const LLT F32 = LLT::scalar(32);
5086   const LLT I32 = LLT::scalar(32);
5087 
5088   if (allowApproxFunc(MF, Flags)) {
5089     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5090       .addUse(X)
5091       .setMIFlags(Flags);
5092     MI.eraseFromParent();
5093     return true;
5094   }
5095 
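  // If x is below 2^-96, scale it up by 2^32 before taking the square root
  // so the intermediate math stays out of the denormal range. Since
  // sqrt(x * 2^32) == 2^16 * sqrt(x), the result is scaled back by 2^-16.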
5096   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5097   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5098   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5099   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5100   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5101 
5102   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5103   if (needsDenormHandlingF32(MF, X, Flags)) {
5104     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5105       .addUse(SqrtX.getReg(0))
5106       .setMIFlags(Flags);
5107 
5108     auto NegOne = B.buildConstant(I32, -1);
5109     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5110 
5111     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5112     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5113 
5114     auto PosOne = B.buildConstant(I32, 1);
5115     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5116 
5117     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5118     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5119 
5120     auto Zero = B.buildFConstant(F32, 0.0f);
5121     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5122 
5123     SqrtS =
5124         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5125 
5126     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5127     SqrtS =
5128         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5129   } else {
5130     auto SqrtR =
5131         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5132     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5133 
5134     auto Half = B.buildFConstant(F32, 0.5f);
5135     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5136     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5137     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5138     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5139     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5140     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5141     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5142     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5143   }
5144 
5145   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5146 
5147   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5148 
5149   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5150 
5151   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5152   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5153 
5154   MI.eraseFromParent();
5155   return true;
5156 }
5157 
5158 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5159                                            MachineRegisterInfo &MRI,
5160                                            MachineIRBuilder &B) const {
5161   // For double type, the SQRT and RSQ instructions don't have the required
5162   // precision, so we apply Goldschmidt's algorithm to improve the result:
5163   //
5164   //   y0 = rsq(x)
5165   //   g0 = x * y0
5166   //   h0 = 0.5 * y0
5167   //
5168   //   r0 = 0.5 - h0 * g0
5169   //   g1 = g0 * r0 + g0
5170   //   h1 = h0 * r0 + h0
5171   //
5172   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5173   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
5174   //   h2 = h1 * r1 + h1
5175   //
5176   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5177   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
5178   //
5179   //   sqrt(x) = g3
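  //
  // The input is also rescaled: if x < 2^-767 it is multiplied by 2^256
  // before the rsq, and since sqrt(x * 2^256) == 2^128 * sqrt(x) the result
  // is scaled back with ldexp(..., -128) afterwards.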
5180 
5181   const LLT S1 = LLT::scalar(1);
5182   const LLT S32 = LLT::scalar(32);
5183   const LLT F64 = LLT::scalar(64);
5184 
5185   Register Dst = MI.getOperand(0).getReg();
5186   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5187 
5188   Register X = MI.getOperand(1).getReg();
5189   unsigned Flags = MI.getFlags();
5190 
5191   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5192 
5193   auto ZeroInt = B.buildConstant(S32, 0);
5194   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5195 
5196   // Scale up input if it is too small.
5197   auto ScaleUpFactor = B.buildConstant(S32, 256);
5198   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5199   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5200 
5201   auto SqrtY =
5202       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5203 
5204   auto Half = B.buildFConstant(F64, 0.5);
5205   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5206   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5207 
5208   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5209   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5210 
5211   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5212   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5213 
5214   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5215   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5216 
5217   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5218 
5219   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5220   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5221 
5222   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5223 
5224   // Scale down the result.
5225   auto ScaleDownFactor = B.buildConstant(S32, -128);
5226   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5227   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5228 
5229   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5230   // with finite only or nsz because rsq(+/-0) = +/-inf
5231 
5232   // TODO: Check for DAZ and expand to subnormals
5233   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5234 
5235   // If x is +INF, +0, or -0, use its original value
5236   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5237 
5238   MI.eraseFromParent();
5239   return true;
5240 }
5241 
5242 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5243                                         MachineRegisterInfo &MRI,
5244                                         MachineIRBuilder &B) const {
5245   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5246   if (Ty == LLT::scalar(32))
5247     return legalizeFSQRTF32(MI, MRI, B);
5248   if (Ty == LLT::scalar(64))
5249     return legalizeFSQRTF64(MI, MRI, B);
5250   if (Ty == LLT::scalar(16))
5251     return legalizeFSQRTF16(MI, MRI, B);
5252   return false;
5253 }
5254 
5255 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5256 // FIXME: Why do we handle this one but not other removed instructions?
5257 //
5258 // Reciprocal square root.  The clamp prevents infinite results, clamping
5259 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5260 // +-max_float.
5261 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5262                                                     MachineRegisterInfo &MRI,
5263                                                     MachineIRBuilder &B) const {
5264   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5265     return true;
5266 
5267   Register Dst = MI.getOperand(0).getReg();
5268   Register Src = MI.getOperand(2).getReg();
5269   auto Flags = MI.getFlags();
5270 
5271   LLT Ty = MRI.getType(Dst);
5272 
5273   const fltSemantics *FltSemantics;
5274   if (Ty == LLT::scalar(32))
5275     FltSemantics = &APFloat::IEEEsingle();
5276   else if (Ty == LLT::scalar(64))
5277     FltSemantics = &APFloat::IEEEdouble();
5278   else
5279     return false;
5280 
5281   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5282                  .addUse(Src)
5283                  .setMIFlags(Flags);
5284 
5285   // We don't need to concern ourselves with the snan handling difference, since
5286   // the rsq already quieted it (or not); use the form that directly selects.
5287   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5288   const bool UseIEEE = MFI->getMode().IEEE;
5289 
5290   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5291   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5292                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5293 
5294   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5295 
5296   if (UseIEEE)
5297     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5298   else
5299     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5300   MI.eraseFromParent();
5301   return true;
5302 }
5303 
5304 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5305   switch (IID) {
5306   case Intrinsic::amdgcn_ds_fadd:
5307     return AMDGPU::G_ATOMICRMW_FADD;
5308   case Intrinsic::amdgcn_ds_fmin:
5309     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5310   case Intrinsic::amdgcn_ds_fmax:
5311     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5312   default:
5313     llvm_unreachable("not a DS FP intrinsic");
5314   }
5315 }
5316 
5317 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5318                                                       MachineInstr &MI,
5319                                                       Intrinsic::ID IID) const {
5320   GISelChangeObserver &Observer = Helper.Observer;
5321   Observer.changingInstr(MI);
5322 
5323   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5324 
5325   // The remaining operands were used to set fields in the MemOperand on
5326   // construction.
5327   for (int I = 6; I > 3; --I)
5328     MI.removeOperand(I);
5329 
5330   MI.removeOperand(1); // Remove the intrinsic ID.
5331   Observer.changedInstr(MI);
5332   return true;
5333 }
5334 
5335 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5336                                             MachineRegisterInfo &MRI,
5337                                             MachineIRBuilder &B) const {
5338   uint64_t Offset =
5339     ST.getTargetLowering()->getImplicitParameterOffset(
5340       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5341   LLT DstTy = MRI.getType(DstReg);
5342   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5343 
5344   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5345   if (!loadInputValue(KernargPtrReg, B,
5346                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5347     return false;
5348 
5349   // FIXME: This should be nuw
5350   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5351   return true;
5352 }
5353 
5354 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5355 /// bits of the pointer and replace them with the stride argument, then
5356 /// merge_values everything together. In the common case of a raw buffer (the
5357 /// stride component is 0), we can just AND off the upper half.
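///
/// For example (illustrative values only), a pointer whose high dword is
/// 0xABCD1234 combined with a constant stride of 16 produces a new high dword
/// of (0xABCD1234 & 0xffff) | (16 << 16) == 0x00101234.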
5358 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5359     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5360   Register Result = MI.getOperand(0).getReg();
5361   Register Pointer = MI.getOperand(2).getReg();
5362   Register Stride = MI.getOperand(3).getReg();
5363   Register NumRecords = MI.getOperand(4).getReg();
5364   Register Flags = MI.getOperand(5).getReg();
5365 
5366   LLT S32 = LLT::scalar(32);
5367 
5368   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5369   auto Unmerge = B.buildUnmerge(S32, Pointer);
5370   Register LowHalf = Unmerge.getReg(0);
5371   Register HighHalf = Unmerge.getReg(1);
5372 
5373   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5374   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5375 
5376   MachineInstrBuilder NewHighHalf = Masked;
5377   std::optional<ValueAndVReg> StrideConst =
5378       getIConstantVRegValWithLookThrough(Stride, MRI);
5379   if (!StrideConst || !StrideConst->Value.isZero()) {
5380     MachineInstrBuilder ShiftedStride;
5381     if (StrideConst) {
5382       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5383       uint32_t ShiftedStrideVal = StrideVal << 16;
5384       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5385     } else {
5386       auto ExtStride = B.buildAnyExt(S32, Stride);
5387       auto ShiftConst = B.buildConstant(S32, 16);
5388       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5389     }
5390     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5391   }
5392   Register NewHighHalfReg = NewHighHalf.getReg(0);
5393   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5394   MI.eraseFromParent();
5395   return true;
5396 }
5397 
5398 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5399                                                  MachineRegisterInfo &MRI,
5400                                                  MachineIRBuilder &B) const {
5401   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5402   if (!MFI->isEntryFunction()) {
5403     return legalizePreloadedArgIntrin(MI, MRI, B,
5404                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5405   }
5406 
5407   Register DstReg = MI.getOperand(0).getReg();
5408   if (!getImplicitArgPtr(DstReg, MRI, B))
5409     return false;
5410 
5411   MI.eraseFromParent();
5412   return true;
5413 }
5414 
5415 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5416                                          MachineRegisterInfo &MRI,
5417                                          MachineIRBuilder &B) const {
5418   Function &F = B.getMF().getFunction();
5419   std::optional<uint32_t> KnownSize =
5420       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5421   if (KnownSize.has_value())
5422     B.buildConstant(DstReg, *KnownSize);
5423   return false;
5424 }
5425 
5426 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5427                                               MachineRegisterInfo &MRI,
5428                                               MachineIRBuilder &B) const {
5429 
5430   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5431   if (!MFI->isEntryFunction()) {
5432     return legalizePreloadedArgIntrin(MI, MRI, B,
5433                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5434   }
5435 
5436   Register DstReg = MI.getOperand(0).getReg();
5437   if (!getLDSKernelId(DstReg, MRI, B))
5438     return false;
5439 
5440   MI.eraseFromParent();
5441   return true;
5442 }
5443 
5444 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5445                                               MachineRegisterInfo &MRI,
5446                                               MachineIRBuilder &B,
5447                                               unsigned AddrSpace) const {
5448   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5449   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5450   Register Hi32 = Unmerge.getReg(1);
5451 
5452   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5453   MI.eraseFromParent();
5454   return true;
5455 }
5456 
5457 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5458 // offset (the offset that is included in bounds checking and swizzling, to be
5459 // split between the instruction's voffset and immoffset fields) and soffset
5460 // (the offset that is excluded from bounds checking and swizzling, to go in
5461 // the instruction's soffset field).  This function takes the first kind of
5462 // offset and figures out how to split it between voffset and immoffset.
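//
// For example, assuming a 4095-byte maximum immediate, a constant offset of
// 4100 is split into an immoffset of 4 and a voffset contribution of 4096.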
5463 std::pair<Register, unsigned>
5464 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5465                                         Register OrigOffset) const {
5466   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5467   Register BaseReg;
5468   unsigned ImmOffset;
5469   const LLT S32 = LLT::scalar(32);
5470   MachineRegisterInfo &MRI = *B.getMRI();
5471 
5472   std::tie(BaseReg, ImmOffset) =
5473       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5474 
5475   // If BaseReg is a pointer, convert it to int.
5476   if (MRI.getType(BaseReg).isPointer())
5477     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5478 
5479   // If the immediate value is too big for the immoffset field, put only bits
5480   // that would normally fit in the immoffset field. The remaining value that
5481   // is copied/added for the voffset field is a large power of 2, and it
5482   // stands more chance of being CSEd with the copy/add for another similar
5483   // load/store.
5484   // However, do not do that rounding down if that is a negative
5485   // number, as it appears to be illegal to have a negative offset in the
5486   // vgpr, even if adding the immediate offset makes it positive.
5487   unsigned Overflow = ImmOffset & ~MaxImm;
5488   ImmOffset -= Overflow;
5489   if ((int32_t)Overflow < 0) {
5490     Overflow += ImmOffset;
5491     ImmOffset = 0;
5492   }
5493 
5494   if (Overflow != 0) {
5495     if (!BaseReg) {
5496       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5497     } else {
5498       auto OverflowVal = B.buildConstant(S32, Overflow);
5499       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5500     }
5501   }
5502 
5503   if (!BaseReg)
5504     BaseReg = B.buildConstant(S32, 0).getReg(0);
5505 
5506   return std::pair(BaseReg, ImmOffset);
5507 }
5508 
5509 /// Handle register layout difference for f16 images for some subtargets.
5510 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5511                                              MachineRegisterInfo &MRI,
5512                                              Register Reg,
5513                                              bool ImageStore) const {
5514   const LLT S16 = LLT::scalar(16);
5515   const LLT S32 = LLT::scalar(32);
5516   LLT StoreVT = MRI.getType(Reg);
5517   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5518 
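  // On subtargets with unpacked D16 memory instructions, each 16-bit element
  // must occupy its own 32-bit register, e.g. <4 x s16> data is widened to
  // <4 x s32> below; packed subtargets keep the compact form.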
5519   if (ST.hasUnpackedD16VMem()) {
5520     auto Unmerge = B.buildUnmerge(S16, Reg);
5521 
5522     SmallVector<Register, 4> WideRegs;
5523     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5524       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5525 
5526     int NumElts = StoreVT.getNumElements();
5527 
5528     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5529         .getReg(0);
5530   }
5531 
5532   if (ImageStore && ST.hasImageStoreD16Bug()) {
5533     if (StoreVT.getNumElements() == 2) {
5534       SmallVector<Register, 4> PackedRegs;
5535       Reg = B.buildBitcast(S32, Reg).getReg(0);
5536       PackedRegs.push_back(Reg);
5537       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5538       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5539           .getReg(0);
5540     }
5541 
5542     if (StoreVT.getNumElements() == 3) {
5543       SmallVector<Register, 4> PackedRegs;
5544       auto Unmerge = B.buildUnmerge(S16, Reg);
5545       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5546         PackedRegs.push_back(Unmerge.getReg(I));
5547       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5548       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5549       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5550     }
5551 
5552     if (StoreVT.getNumElements() == 4) {
5553       SmallVector<Register, 4> PackedRegs;
5554       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5555       auto Unmerge = B.buildUnmerge(S32, Reg);
5556       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5557         PackedRegs.push_back(Unmerge.getReg(I));
5558       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5559       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5560           .getReg(0);
5561     }
5562 
5563     llvm_unreachable("invalid data type");
5564   }
5565 
5566   if (StoreVT == LLT::fixed_vector(3, S16)) {
5567     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5568               .getReg(0);
5569   }
5570   return Reg;
5571 }
5572 
5573 Register AMDGPULegalizerInfo::fixStoreSourceType(
5574   MachineIRBuilder &B, Register VData, bool IsFormat) const {
5575   MachineRegisterInfo *MRI = B.getMRI();
5576   LLT Ty = MRI->getType(VData);
5577 
5578   const LLT S16 = LLT::scalar(16);
5579 
5580   // Fixup buffer resources themselves needing to be v4i32.
5581   if (hasBufferRsrcWorkaround(Ty))
5582     return castBufferRsrcToV4I32(VData, B);
5583 
5584   // Fixup illegal register types for i8 stores.
5585   if (Ty == LLT::scalar(8) || Ty == S16) {
5586     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5587     return AnyExt;
5588   }
5589 
5590   if (Ty.isVector()) {
5591     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5592       if (IsFormat)
5593         return handleD16VData(B, *MRI, VData);
5594     }
5595   }
5596 
5597   return VData;
5598 }
5599 
5600 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5601                                               MachineRegisterInfo &MRI,
5602                                               MachineIRBuilder &B,
5603                                               bool IsTyped,
5604                                               bool IsFormat) const {
5605   Register VData = MI.getOperand(1).getReg();
5606   LLT Ty = MRI.getType(VData);
5607   LLT EltTy = Ty.getScalarType();
5608   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5609   const LLT S32 = LLT::scalar(32);
5610 
5611   VData = fixStoreSourceType(B, VData, IsFormat);
5612   castBufferRsrcArgToV4I32(MI, B, 2);
5613   Register RSrc = MI.getOperand(2).getReg();
5614 
5615   MachineMemOperand *MMO = *MI.memoperands_begin();
5616   const int MemSize = MMO->getSize();
5617 
5618   unsigned ImmOffset;
5619 
5620   // The typed intrinsics add an immediate after the registers.
5621   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5622 
5623   // The struct intrinsic variants add one additional operand over raw.
5624   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5625   Register VIndex;
5626   int OpOffset = 0;
5627   if (HasVIndex) {
5628     VIndex = MI.getOperand(3).getReg();
5629     OpOffset = 1;
5630   } else {
5631     VIndex = B.buildConstant(S32, 0).getReg(0);
5632   }
5633 
5634   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5635   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5636 
5637   unsigned Format = 0;
5638   if (IsTyped) {
5639     Format = MI.getOperand(5 + OpOffset).getImm();
5640     ++OpOffset;
5641   }
5642 
5643   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5644 
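       // Split the total voffset into a register component and an immediate
       // component that fits in the instruction's offset field.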
5645   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5646 
5647   unsigned Opc;
5648   if (IsTyped) {
5649     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5650                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5651   } else if (IsFormat) {
5652     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5653                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5654   } else {
5655     switch (MemSize) {
5656     case 1:
5657       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5658       break;
5659     case 2:
5660       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5661       break;
5662     default:
5663       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5664       break;
5665     }
5666   }
5667 
5668   auto MIB = B.buildInstr(Opc)
5669     .addUse(VData)              // vdata
5670     .addUse(RSrc)               // rsrc
5671     .addUse(VIndex)             // vindex
5672     .addUse(VOffset)            // voffset
5673     .addUse(SOffset)            // soffset
5674     .addImm(ImmOffset);         // offset(imm)
5675 
5676   if (IsTyped)
5677     MIB.addImm(Format);
5678 
5679   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5680      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5681      .addMemOperand(MMO);
5682 
5683   MI.eraseFromParent();
5684   return true;
5685 }
5686 
5687 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5688                             Register VIndex, Register VOffset, Register SOffset,
5689                             unsigned ImmOffset, unsigned Format,
5690                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5691                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5692   auto MIB = B.buildInstr(Opc)
5693                  .addDef(LoadDstReg) // vdata
5694                  .addUse(RSrc)       // rsrc
5695                  .addUse(VIndex)     // vindex
5696                  .addUse(VOffset)    // voffset
5697                  .addUse(SOffset)    // soffset
5698                  .addImm(ImmOffset); // offset(imm)
5699 
5700   if (IsTyped)
5701     MIB.addImm(Format);
5702 
5703   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5704       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5705       .addMemOperand(MMO);
5706 }
5707 
5708 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5709                                              MachineRegisterInfo &MRI,
5710                                              MachineIRBuilder &B,
5711                                              bool IsFormat,
5712                                              bool IsTyped) const {
5713   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5714   MachineMemOperand *MMO = *MI.memoperands_begin();
5715   const LLT MemTy = MMO->getMemoryType();
5716   const LLT S32 = LLT::scalar(32);
5717 
5718   Register Dst = MI.getOperand(0).getReg();
5719 
5720   Register StatusDst;
5721   int OpOffset = 0;
5722   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5723   bool IsTFE = MI.getNumExplicitDefs() == 2;
5724   if (IsTFE) {
5725     StatusDst = MI.getOperand(1).getReg();
5726     ++OpOffset;
5727   }
5728 
5729   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5730   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5731 
5732   // The typed intrinsics add an immediate after the registers.
5733   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5734 
5735   // The struct intrinsic variants add one additional operand over raw.
5736   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5737   Register VIndex;
5738   if (HasVIndex) {
5739     VIndex = MI.getOperand(3 + OpOffset).getReg();
5740     ++OpOffset;
5741   } else {
5742     VIndex = B.buildConstant(S32, 0).getReg(0);
5743   }
5744 
5745   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5746   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5747 
5748   unsigned Format = 0;
5749   if (IsTyped) {
5750     Format = MI.getOperand(5 + OpOffset).getImm();
5751     ++OpOffset;
5752   }
5753 
5754   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5755   unsigned ImmOffset;
5756 
5757   LLT Ty = MRI.getType(Dst);
5758   // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
5759   // logic doesn't have to handle that case.
5760   if (hasBufferRsrcWorkaround(Ty)) {
5761     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5762     Dst = MI.getOperand(0).getReg();
5763   }
5764   LLT EltTy = Ty.getScalarType();
5765   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5766   const bool Unpacked = ST.hasUnpackedD16VMem();
5767 
5768   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5769 
5770   unsigned Opc;
5771 
5772   // TODO: Support TFE for typed and narrow loads.
5773   if (IsTyped) {
5774     if (IsTFE)
5775       return false;
5776     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5777                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5778   } else if (IsFormat) {
5779     if (IsD16) {
5780       if (IsTFE)
5781         return false;
5782       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5783     } else {
5784       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5785                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5786     }
5787   } else {
5788     if (IsTFE)
5789       return false;
5790     switch (MemTy.getSizeInBits()) {
5791     case 8:
5792       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5793       break;
5794     case 16:
5795       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5796       break;
5797     default:
5798       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5799       break;
5800     }
5801   }
5802 
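       // With TFE the instruction returns an extra status dword after the data, so
       // load into a wider temporary and unmerge it into the data and status parts.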
5803   if (IsTFE) {
5804     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5805     unsigned NumLoadDWords = NumValueDWords + 1;
5806     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5807     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5808     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5809                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5810     if (NumValueDWords == 1) {
5811       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5812     } else {
5813       SmallVector<Register, 5> LoadElts;
5814       for (unsigned I = 0; I != NumValueDWords; ++I)
5815         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5816       LoadElts.push_back(StatusDst);
5817       B.buildUnmerge(LoadElts, LoadDstReg);
5818       LoadElts.truncate(NumValueDWords);
5819       B.buildMergeLikeInstr(Dst, LoadElts);
5820     }
5821   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5822              (IsD16 && !Ty.isVector())) {
5823     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5824     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5825                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5826     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5827     B.buildTrunc(Dst, LoadDstReg);
5828   } else if (Unpacked && IsD16 && Ty.isVector()) {
5829     LLT UnpackedTy = Ty.changeElementSize(32);
5830     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5831     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5832                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5833     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5834     // FIXME: G_TRUNC should work, but legalization currently fails
5835     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5836     SmallVector<Register, 4> Repack;
5837     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5838       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5839     B.buildMergeLikeInstr(Dst, Repack);
5840   } else {
5841     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5842                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5843   }
5844 
5845   MI.eraseFromParent();
5846   return true;
5847 }
5848 
5849 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5850   switch (IntrID) {
5851   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5852   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5853   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5854   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5855     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5856   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5857   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5858   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5859   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5860     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5861   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5862   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5863   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5864   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5865     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5866   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5867   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5868   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5869   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5870     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5871   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5872   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5873   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5874   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5875     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5876   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5877   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5878   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5879   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5880     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5881   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5882   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5883   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5884   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5885     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5886   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5887   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5888   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5889   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5890     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5891   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5892   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5893   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5894   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5896   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5897   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5898   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5899   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5900     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5901   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5902   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5903   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5904   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5905     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5906   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5907   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5908   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5909   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5910     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5911   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5912   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5913   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5914   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5915     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5916   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5917   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5918   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5919   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5920     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5921   case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5922   case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5923     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5924   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5925   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5926   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5927   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5928     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5929   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5930   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5931   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5932   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5933     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5934   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5935   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5936     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5937   default:
5938     llvm_unreachable("unhandled atomic opcode");
5939   }
5940 }
5941 
5942 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5943                                                MachineIRBuilder &B,
5944                                                Intrinsic::ID IID) const {
5945   const bool IsCmpSwap =
5946       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5947       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5948       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5949       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5950 
5951   Register Dst = MI.getOperand(0).getReg();
5952   // Since we don't have 128-bit atomics, we don't need to handle the case of
5953   // p8 arguments to the atomic itself.
5954   Register VData = MI.getOperand(2).getReg();
5955 
5956   Register CmpVal;
5957   int OpOffset = 0;
5958 
5959   if (IsCmpSwap) {
5960     CmpVal = MI.getOperand(3).getReg();
5961     ++OpOffset;
5962   }
5963 
5964   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5965   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5966   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
5967 
5968   // The struct intrinsic variants add one additional operand over raw.
5969   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5970   Register VIndex;
5971   if (HasVIndex) {
5972     VIndex = MI.getOperand(4 + OpOffset).getReg();
5973     ++OpOffset;
5974   } else {
5975     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5976   }
5977 
5978   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5979   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5980   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5981 
5982   MachineMemOperand *MMO = *MI.memoperands_begin();
5983 
5984   unsigned ImmOffset;
5985   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5986 
5987   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
5988       .addDef(Dst)
5989       .addUse(VData); // vdata
5990 
5991   if (IsCmpSwap)
5992     MIB.addReg(CmpVal);
5993 
5994   MIB.addUse(RSrc)               // rsrc
5995      .addUse(VIndex)             // vindex
5996      .addUse(VOffset)            // voffset
5997      .addUse(SOffset)            // soffset
5998      .addImm(ImmOffset)          // offset(imm)
5999      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
6000      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6001      .addMemOperand(MMO);
6002 
6003   MI.eraseFromParent();
6004   return true;
6005 }
6006 
6007 /// Pack the s16-typed address operands of \p MI into dword-sized <2 x s16>
6008 /// values and append them to \p PackedAddrs.
6009 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6010                                       SmallVectorImpl<Register> &PackedAddrs,
6011                                       unsigned ArgOffset,
6012                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
6013                                       bool IsA16, bool IsG16) {
6014   const LLT S16 = LLT::scalar(16);
6015   const LLT V2S16 = LLT::fixed_vector(2, 16);
6016   auto EndIdx = Intr->VAddrEnd;
6017 
6018   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6019     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6020     if (!SrcOp.isReg())
6021       continue; // _L to _LZ may have eliminated this.
6022 
6023     Register AddrReg = SrcOp.getReg();
6024 
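         // Operands before GradientStart, gradients when G16 is off, and
         // coordinates when A16 is off stay as full dwords (with bias handled
         // specially below); the remaining 16-bit operands are packed in pairs.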
6025     if ((I < Intr->GradientStart) ||
6026         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6027         (I >= Intr->CoordStart && !IsA16)) {
6028       if ((I < Intr->GradientStart) && IsA16 &&
6029           (B.getMRI()->getType(AddrReg) == S16)) {
6030         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6031         // Special handling of bias when A16 is on. Bias is of type half but
6032         // occupies a full 32-bit slot.
6033         PackedAddrs.push_back(
6034             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6035                 .getReg(0));
6036       } else {
6037         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6038                "Bias needs to be converted to 16 bit in A16 mode");
6039         // Handle any gradient or coordinate operands that should not be packed
6040         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6041         PackedAddrs.push_back(AddrReg);
6042       }
6043     } else {
6044       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6045       // derivatives dx/dh and dx/dv are packed with undef.
6046       if (((I + 1) >= EndIdx) ||
6047           ((Intr->NumGradients / 2) % 2 == 1 &&
6048            (I == static_cast<unsigned>(Intr->GradientStart +
6049                                        (Intr->NumGradients / 2) - 1) ||
6050             I == static_cast<unsigned>(Intr->GradientStart +
6051                                        Intr->NumGradients - 1))) ||
6052           // Check for _L to _LZ optimization
6053           !MI.getOperand(ArgOffset + I + 1).isReg()) {
6054         PackedAddrs.push_back(
6055             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6056                 .getReg(0));
6057       } else {
6058         PackedAddrs.push_back(
6059             B.buildBuildVector(
6060                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6061                 .getReg(0));
6062         ++I;
6063       }
6064     }
6065   }
6066 }
6067 
6068 /// Convert from separate vaddr components to a single vector address register,
6069 /// and replace the remaining operands with $noreg.
6070 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6071                                      int DimIdx, int NumVAddrs) {
6072   const LLT S32 = LLT::scalar(32);
6073   (void)S32;
6074   SmallVector<Register, 8> AddrRegs;
6075   for (int I = 0; I != NumVAddrs; ++I) {
6076     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6077     if (SrcOp.isReg()) {
6078       AddrRegs.push_back(SrcOp.getReg());
6079       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6080     }
6081   }
6082 
6083   int NumAddrRegs = AddrRegs.size();
6084   if (NumAddrRegs != 1) {
6085     auto VAddr =
6086         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6087     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6088   }
6089 
6090   for (int I = 1; I != NumVAddrs; ++I) {
6091     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6092     if (SrcOp.isReg())
6093       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6094   }
6095 }
6096 
6097 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6098 ///
6099 /// Depending on the subtarget, load/store with 16-bit element data need to be
6100 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6101 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6102 /// registers.
6103 ///
6104 /// We don't want to directly select image instructions just yet, but also want
6105 /// to expose all register repacking to the legalizer/combiners. We also don't
6106 /// want a selected instruction entering RegBankSelect. In order to avoid
6107 /// defining a multitude of intermediate image instructions, directly hack on
6108 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6109 /// padding now-unnecessary arguments with $noreg.
6110 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6111     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6112     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6113 
6114   const MachineFunction &MF = *MI.getMF();
6115   const unsigned NumDefs = MI.getNumExplicitDefs();
6116   const unsigned ArgOffset = NumDefs + 1;
6117   bool IsTFE = NumDefs == 2;
6118   // We are only processing the operands of d16 image operations on subtargets
6119   // that use the unpacked register layout, or need to repack the TFE result.
6120 
6121   // TODO: Do we need to guard against already legalized intrinsics?
6122   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6123       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6124 
6125   MachineRegisterInfo *MRI = B.getMRI();
6126   const LLT S32 = LLT::scalar(32);
6127   const LLT S16 = LLT::scalar(16);
6128   const LLT V2S16 = LLT::fixed_vector(2, 16);
6129 
6130   unsigned DMask = 0;
6131   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6132   LLT Ty = MRI->getType(VData);
6133 
6134   const bool IsAtomicPacked16Bit =
6135       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6136        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6137 
6138   // Check for 16 bit addresses and pack if true.
6139   LLT GradTy =
6140       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6141   LLT AddrTy =
6142       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6143   const bool IsG16 =
6144       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6145   const bool IsA16 = AddrTy == S16;
6146   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6147 
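       // DMask selects which channels the operation reads or writes; DMaskLanes is
       // the number of enabled channels (always 4 for gather4).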
6148   int DMaskLanes = 0;
6149   if (!BaseOpcode->Atomic) {
6150     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6151     if (BaseOpcode->Gather4) {
6152       DMaskLanes = 4;
6153     } else if (DMask != 0) {
6154       DMaskLanes = llvm::popcount(DMask);
6155     } else if (!IsTFE && !BaseOpcode->Store) {
6156       // If dmask is 0, this is a no-op load. This can be eliminated.
6157       B.buildUndef(MI.getOperand(0));
6158       MI.eraseFromParent();
6159       return true;
6160     }
6161   }
6162 
6163   Observer.changingInstr(MI);
6164   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6165 
6166   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6167                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6168   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6169                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6170   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6171 
6172   // Track that we legalized this
6173   MI.setDesc(B.getTII().get(NewOpcode));
6174 
6175   // Expecting to get an error flag since TFE is on and dmask is 0. Force
6176   // dmask to be at least 1, otherwise the instruction will fail.
6177   if (IsTFE && DMask == 0) {
6178     DMask = 0x1;
6179     DMaskLanes = 1;
6180     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6181   }
6182 
6183   if (BaseOpcode->Atomic) {
6184     Register VData0 = MI.getOperand(2).getReg();
6185     LLT Ty = MRI->getType(VData0);
6186 
6187     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6188     if (Ty.isVector() && !IsAtomicPacked16Bit)
6189       return false;
6190 
6191     if (BaseOpcode->AtomicX2) {
6192       Register VData1 = MI.getOperand(3).getReg();
6193       // The two values are packed in one register.
6194       LLT PackedTy = LLT::fixed_vector(2, Ty);
6195       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6196       MI.getOperand(2).setReg(Concat.getReg(0));
6197       MI.getOperand(3).setReg(AMDGPU::NoRegister);
6198     }
6199   }
6200 
6201   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6202 
6203   // Rewrite the addressing register layout before doing anything else.
6204   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6205     // 16-bit gradients are supported, but are tied to the A16 control,
6206     // so both gradients and addresses must be 16-bit.
6207     return false;
6208   }
6209 
6210   if (IsA16 && !ST.hasA16()) {
6211     // A16 not supported
6212     return false;
6213   }
6214 
6215   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6216   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6217 
6218   if (IsA16 || IsG16) {
6219     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6220     // instructions expect VGPR_32
6221     SmallVector<Register, 4> PackedRegs;
6222 
6223     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6224 
6225     // See also below in the non-a16 branch
6226     const bool UseNSA = ST.hasNSAEncoding() &&
6227                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6228                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6229     const bool UsePartialNSA =
6230         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6231 
6232     if (UsePartialNSA) {
6233       // Pack registers that would go over NSAMaxSize into the last VAddr register
6234       LLT PackedAddrTy =
6235           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6236       auto Concat = B.buildConcatVectors(
6237           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6238       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6239       PackedRegs.resize(NSAMaxSize);
6240     } else if (!UseNSA && PackedRegs.size() > 1) {
6241       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6242       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6243       PackedRegs[0] = Concat.getReg(0);
6244       PackedRegs.resize(1);
6245     }
6246 
6247     const unsigned NumPacked = PackedRegs.size();
6248     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6249       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6250       if (!SrcOp.isReg()) {
6251         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6252         continue;
6253       }
6254 
6255       assert(SrcOp.getReg() != AMDGPU::NoRegister);
6256 
6257       if (I - Intr->VAddrStart < NumPacked)
6258         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6259       else
6260         SrcOp.setReg(AMDGPU::NoRegister);
6261     }
6262   } else {
6263     // If the register allocator cannot place the address registers contiguously
6264     // without introducing moves, then using the non-sequential address encoding
6265     // is always preferable, since it saves VALU instructions and is usually a
6266     // wash in terms of code size or even better.
6267     //
6268     // However, we currently have no way of hinting to the register allocator
6269     // that MIMG addresses should be placed contiguously when it is possible to
6270     // do so, so force non-NSA for the common 2-address case as a heuristic.
6271     //
6272     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6273     // allocation when possible.
6274     //
6275     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6276     // set of the remaining addresses.
6277     const bool UseNSA = ST.hasNSAEncoding() &&
6278                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6279                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6280     const bool UsePartialNSA =
6281         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6282 
6283     if (UsePartialNSA) {
6284       convertImageAddrToPacked(B, MI,
6285                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6286                                Intr->NumVAddrs - NSAMaxSize + 1);
6287     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6288       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6289                                Intr->NumVAddrs);
6290     }
6291   }
6292 
6293   int Flags = 0;
6294   if (IsA16)
6295     Flags |= 1;
6296   if (IsG16)
6297     Flags |= 2;
6298   MI.addOperand(MachineOperand::CreateImm(Flags));
6299 
6300   if (BaseOpcode->Store) { // No TFE for stores?
6301     // TODO: Handle dmask trim
6302     if (!Ty.isVector() || !IsD16)
6303       return true;
6304 
6305     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6306     if (RepackedReg != VData) {
6307       MI.getOperand(1).setReg(RepackedReg);
6308     }
6309 
6310     return true;
6311   }
6312 
6313   Register DstReg = MI.getOperand(0).getReg();
6314   const LLT EltTy = Ty.getScalarType();
6315   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6316 
6317   // Confirm that the return type is large enough for the dmask specified
6318   if (NumElts < DMaskLanes)
6319     return false;
6320 
6321   if (NumElts > 4 || DMaskLanes > 4)
6322     return false;
6323 
6324   // Image atomic instructions use DMask to specify how many bits of
6325   // input/output data there are: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6326   // DMaskLanes for image atomics defaults to '0'.
6327   // We must make sure that atomic variants (especially packed ones) are not
6328   // truncated from v2s16 or v4s16 to s16.
6329   //
6330   // ChangeElementCount will be needed for image loads, where Ty is always scalar.
6331   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6332   const LLT AdjustedTy =
6333       DMaskLanes == 0
6334           ? Ty
6335           : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6336 
6337   // The raw dword-aligned data component of the load. The only legal cases
6338   // where this matters should be when using the packed D16 format, for
6339   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6340   LLT RoundedTy;
6341 
6342   // S32 vector to cover all data, plus TFE result element.
6343   LLT TFETy;
6344 
6345   // Register type to use for each loaded component. Will be S32 or V2S16.
6346   LLT RegTy;
6347 
6348   if (IsD16 && ST.hasUnpackedD16VMem()) {
6349     RoundedTy =
6350         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6351     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6352     RegTy = S32;
6353   } else {
6354     unsigned EltSize = EltTy.getSizeInBits();
6355     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6356     unsigned RoundedSize = 32 * RoundedElts;
6357     RoundedTy = LLT::scalarOrVector(
6358         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6359     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6360     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6361   }
6362 
6363   // The return type does not need adjustment.
6364   // TODO: Should we change s16 case to s32 or <2 x s16>?
6365   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6366     return true;
6367 
6368   Register Dst1Reg;
6369 
6370   // Insert after the instruction.
6371   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6372 
6373   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6374   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6375   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6376   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6377 
6378   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6379 
6380   MI.getOperand(0).setReg(NewResultReg);
6381 
6382   // In the IR, TFE is supposed to be used with a 2 element struct return
6383   // type. The instruction really returns these two values in one contiguous
6384   // register, with one additional dword beyond the loaded data. Rewrite the
6385   // return type to use a single register result.
6386 
6387   if (IsTFE) {
6388     Dst1Reg = MI.getOperand(1).getReg();
6389     if (MRI->getType(Dst1Reg) != S32)
6390       return false;
6391 
6392     // TODO: Make sure the TFE operand bit is set.
6393     MI.removeOperand(1);
6394 
6395     // Handle the easy case that requires no repack instructions.
6396     if (Ty == S32) {
6397       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6398       return true;
6399     }
6400   }
6401 
6402   // Now figure out how to copy the new result register back into the old
6403   // result.
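       // Every entry is seeded with Dst1Reg so that, in the TFE case, the trailing
       // status dword from the unmerge below lands directly in its destination.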
6404   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6405 
6406   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6407 
6408   if (ResultNumRegs == 1) {
6409     assert(!IsTFE);
6410     ResultRegs[0] = NewResultReg;
6411   } else {
6412     // We have to repack into a new vector of some kind.
6413     for (int I = 0; I != NumDataRegs; ++I)
6414       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6415     B.buildUnmerge(ResultRegs, NewResultReg);
6416 
6417     // Drop the final TFE element to get the data part. The TFE result is
6418     // directly written to the right place already.
6419     if (IsTFE)
6420       ResultRegs.resize(NumDataRegs);
6421   }
6422 
6423   // For an s16 scalar result, we form an s32 result with a truncate regardless
6424   // of packed vs. unpacked.
6425   if (IsD16 && !Ty.isVector()) {
6426     B.buildTrunc(DstReg, ResultRegs[0]);
6427     return true;
6428   }
6429 
6430   // Avoid a build/concat_vector of 1 entry.
6431   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6432     B.buildBitcast(DstReg, ResultRegs[0]);
6433     return true;
6434   }
6435 
6436   assert(Ty.isVector());
6437 
6438   if (IsD16) {
6439     // For packed D16 results with TFE enabled, all the data components are
6440     // S32. Cast back to the expected type.
6441     //
6442     // TODO: We don't really need to use s32 elements for the load. We would
6443     // only need one cast for the TFE result if a multiple of v2s16 was used.
6444     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6445       for (Register &Reg : ResultRegs)
6446         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6447     } else if (ST.hasUnpackedD16VMem()) {
6448       for (Register &Reg : ResultRegs)
6449         Reg = B.buildTrunc(S16, Reg).getReg(0);
6450     }
6451   }
6452 
6453   auto padWithUndef = [&](LLT Ty, int NumElts) {
6454     if (NumElts == 0)
6455       return;
6456     Register Undef = B.buildUndef(Ty).getReg(0);
6457     for (int I = 0; I != NumElts; ++I)
6458       ResultRegs.push_back(Undef);
6459   };
6460 
6461   // Pad out any elements eliminated due to the dmask.
6462   LLT ResTy = MRI->getType(ResultRegs[0]);
6463   if (!ResTy.isVector()) {
6464     padWithUndef(ResTy, NumElts - ResultRegs.size());
6465     B.buildBuildVector(DstReg, ResultRegs);
6466     return true;
6467   }
6468 
6469   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6470   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6471 
6472   // Deal with the one annoying legal case.
6473   const LLT V3S16 = LLT::fixed_vector(3, 16);
6474   if (Ty == V3S16) {
6475     if (IsTFE) {
6476       if (ResultRegs.size() == 1) {
6477         NewResultReg = ResultRegs[0];
6478       } else if (ResultRegs.size() == 2) {
6479         LLT V4S16 = LLT::fixed_vector(4, 16);
6480         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6481       } else {
6482         return false;
6483       }
6484     }
6485 
6486     if (MRI->getType(DstReg).getNumElements() <
6487         MRI->getType(NewResultReg).getNumElements()) {
6488       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6489     } else {
6490       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6491     }
6492     return true;
6493   }
6494 
6495   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6496   B.buildConcatVectors(DstReg, ResultRegs);
6497   return true;
6498 }
6499 
6500 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6501                                               MachineInstr &MI) const {
6502   MachineIRBuilder &B = Helper.MIRBuilder;
6503   GISelChangeObserver &Observer = Helper.Observer;
6504 
6505   Register OrigDst = MI.getOperand(0).getReg();
6506   Register Dst;
6507   LLT Ty = B.getMRI()->getType(OrigDst);
6508   unsigned Size = Ty.getSizeInBits();
6509   MachineFunction &MF = B.getMF();
6510   unsigned Opc = 0;
6511   if (Size < 32 && ST.hasScalarSubwordLoads()) {
6512     assert(Size == 8 || Size == 16);
6513     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6514                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6515     // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6516     // destination register.
6517     Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6518   } else {
6519     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6520     Dst = OrigDst;
6521   }
6522 
6523   Observer.changingInstr(MI);
6524 
6525   // Handle needing to s.buffer.load() a p8 value.
6526   if (hasBufferRsrcWorkaround(Ty)) {
6527     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6528     B.setInsertPt(B.getMBB(), MI);
6529   }
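       // Result types that should not be loaded directly are first bitcast to an
       // equivalent type with 32-bit elements.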
6530   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6531     Ty = getBitcastRegisterType(Ty);
6532     Helper.bitcastDst(MI, Ty, 0);
6533     B.setInsertPt(B.getMBB(), MI);
6534   }
6535 
6536   // FIXME: We don't really need this intermediate instruction. The intrinsic
6537   // should be fixed to have a memory operand. Since it's readnone, we're not
6538   // allowed to add one.
6539   MI.setDesc(B.getTII().get(Opc));
6540   MI.removeOperand(1); // Remove intrinsic ID
6541 
6542   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6543   // TODO: Should this use datalayout alignment?
6544   const unsigned MemSize = (Size + 7) / 8;
6545   const Align MemAlign(std::min(MemSize, 4u));
6546   MachineMemOperand *MMO = MF.getMachineMemOperand(
6547       MachinePointerInfo(),
6548       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6549           MachineMemOperand::MOInvariant,
6550       MemSize, MemAlign);
6551   MI.addMemOperand(MF, MMO);
6552   if (Dst != OrigDst) {
6553     MI.getOperand(0).setReg(Dst);
6554     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6555     B.buildTrunc(OrigDst, Dst);
6556   }
6557 
6558   // If we don't have 96-bit result scalar loads, widening to 128-bit should
6559   // always be legal. We may need to restore this to a 96-bit result if it turns
6560   // out this needs to be converted to a vector load during RegBankSelect.
6561   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6562     if (Ty.isVector())
6563       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6564     else
6565       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6566   }
6567 
6568   Observer.changedInstr(MI);
6569   return true;
6570 }
6571 
6572 // TODO: Move to selection
6573 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
6574                                                 MachineRegisterInfo &MRI,
6575                                                 MachineIRBuilder &B) const {
6576   if (!ST.isTrapHandlerEnabled() ||
6577       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6578     return legalizeTrapEndpgm(MI, MRI, B);
6579 
6580   return ST.supportsGetDoorbellID() ?
6581          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6582 }
6583 
6584 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6585     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6586   const DebugLoc &DL = MI.getDebugLoc();
6587   MachineBasicBlock &BB = B.getMBB();
6588   MachineFunction *MF = BB.getParent();
6589 
6590   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6591     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6592       .addImm(0);
6593     MI.eraseFromParent();
6594     return true;
6595   }
6596 
6597   // We need a block split to make the real endpgm a terminator. We also don't
6598   // want to break phis in successor blocks, so we can't just delete to the
6599   // end of the block.
6600   BB.splitAt(MI, false /*UpdateLiveIns*/);
6601   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6602   MF->push_back(TrapBB);
6603   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6604     .addImm(0);
6605   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6606     .addMBB(TrapBB);
6607 
6608   BB.addSuccessor(TrapBB);
6609   MI.eraseFromParent();
6610   return true;
6611 }
6612 
6613 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6614     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6615   MachineFunction &MF = B.getMF();
6616   const LLT S64 = LLT::scalar(64);
6617 
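       // The HSA trap handler ABI expects the queue pointer in SGPR0_SGPR1; see the
       // reference link below.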
6618   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6619   // For code object version 5 and later, queue_ptr is passed as an implicit kernel argument.
6620   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6621       AMDGPU::AMDHSA_COV5) {
6622     AMDGPUTargetLowering::ImplicitParameter Param =
6623         AMDGPUTargetLowering::QUEUE_PTR;
6624     uint64_t Offset =
6625         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6626 
6627     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6628         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6629 
6630     if (!loadInputValue(KernargPtrReg, B,
6631                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6632       return false;
6633 
6634     // TODO: can we be smarter about machine pointer info?
6635     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6636     MachineMemOperand *MMO = MF.getMachineMemOperand(
6637         PtrInfo,
6638         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6639             MachineMemOperand::MOInvariant,
6640         LLT::scalar(64), commonAlignment(Align(64), Offset));
6641 
6642     // Pointer address
6643     Register LoadAddr = MRI.createGenericVirtualRegister(
6644         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6645     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6646                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6647     // Load the queue pointer from that address.
6648     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6649     B.buildCopy(SGPR01, Temp);
6650     B.buildInstr(AMDGPU::S_TRAP)
6651         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6652         .addReg(SGPR01, RegState::Implicit);
6653     MI.eraseFromParent();
6654     return true;
6655   }
6656 
6657   // Pass queue pointer to trap handler as input, and insert trap instruction
6658   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6659   Register LiveIn =
6660     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6661   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6662     return false;
6663 
6664   B.buildCopy(SGPR01, LiveIn);
6665   B.buildInstr(AMDGPU::S_TRAP)
6666       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6667       .addReg(SGPR01, RegState::Implicit);
6668 
6669   MI.eraseFromParent();
6670   return true;
6671 }
6672 
6673 bool AMDGPULegalizerInfo::legalizeTrapHsa(
6674     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6675   B.buildInstr(AMDGPU::S_TRAP)
6676       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6677   MI.eraseFromParent();
6678   return true;
6679 }
6680 
6681 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6682     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6683   // If this is a non-HSA path or the trap handler is disabled, report a
6684   // warning accordingly.
6685   if (!ST.isTrapHandlerEnabled() ||
6686       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6687     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6688                                      "debugtrap handler not supported",
6689                                      MI.getDebugLoc(), DS_Warning);
6690     LLVMContext &Ctx = B.getMF().getFunction().getContext();
6691     Ctx.diagnose(NoTrap);
6692   } else {
6693     // Insert debug-trap instruction
6694     B.buildInstr(AMDGPU::S_TRAP)
6695         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6696   }
6697 
6698   MI.eraseFromParent();
6699   return true;
6700 }
6701 
6702 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6703                                                MachineIRBuilder &B) const {
6704   MachineRegisterInfo &MRI = *B.getMRI();
6705   const LLT S16 = LLT::scalar(16);
6706   const LLT S32 = LLT::scalar(32);
6707   const LLT V2S16 = LLT::fixed_vector(2, 16);
6708   const LLT V3S32 = LLT::fixed_vector(3, 32);
6709 
6710   Register DstReg = MI.getOperand(0).getReg();
6711   Register NodePtr = MI.getOperand(2).getReg();
6712   Register RayExtent = MI.getOperand(3).getReg();
6713   Register RayOrigin = MI.getOperand(4).getReg();
6714   Register RayDir = MI.getOperand(5).getReg();
6715   Register RayInvDir = MI.getOperand(6).getReg();
6716   Register TDescr = MI.getOperand(7).getReg();
6717 
6718   if (!ST.hasGFX10_AEncoding()) {
6719     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6720                                         "intrinsic not supported on subtarget",
6721                                         MI.getDebugLoc());
6722     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6723     return false;
6724   }
6725 
6726   const bool IsGFX11 = AMDGPU::isGFX11(ST);
6727   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6728   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6729   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6730   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
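       // vaddr is node_ptr (1 or 2 dwords), ray_extent (1), ray_origin (3), then
       // ray_dir and ray_inv_dir (3 dwords each, or 3 dwords total when packed as
       // halves for A16).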
6731   const unsigned NumVDataDwords = 4;
6732   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6733   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6734   const bool UseNSA =
6735       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6736 
6737   const unsigned BaseOpcodes[2][2] = {
6738       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6739       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6740        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6741   int Opcode;
6742   if (UseNSA) {
6743     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6744                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6745                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
6746                                                : AMDGPU::MIMGEncGfx10NSA,
6747                                    NumVDataDwords, NumVAddrDwords);
6748   } else {
6749     assert(!IsGFX12Plus);
6750     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6751                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6752                                            : AMDGPU::MIMGEncGfx10Default,
6753                                    NumVDataDwords, NumVAddrDwords);
6754   }
6755   assert(Opcode != -1);
6756 
6757   SmallVector<Register, 12> Ops;
6758   if (UseNSA && IsGFX11Plus) {
6759     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6760       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6761       auto Merged = B.buildMergeLikeInstr(
6762           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6763       Ops.push_back(Merged.getReg(0));
6764     };
6765 
6766     Ops.push_back(NodePtr);
6767     Ops.push_back(RayExtent);
6768     packLanes(RayOrigin);
6769 
6770     if (IsA16) {
6771       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6772       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6773       auto MergedDir = B.buildMergeLikeInstr(
6774           V3S32,
6775           {B.buildBitcast(
6776                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6777                                                    UnmergeRayDir.getReg(0)}))
6778                .getReg(0),
6779            B.buildBitcast(
6780                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6781                                                    UnmergeRayDir.getReg(1)}))
6782                .getReg(0),
6783            B.buildBitcast(
6784                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6785                                                    UnmergeRayDir.getReg(2)}))
6786                .getReg(0)});
6787       Ops.push_back(MergedDir.getReg(0));
6788     } else {
6789       packLanes(RayDir);
6790       packLanes(RayInvDir);
6791     }
6792   } else {
6793     if (Is64) {
6794       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6795       Ops.push_back(Unmerge.getReg(0));
6796       Ops.push_back(Unmerge.getReg(1));
6797     } else {
6798       Ops.push_back(NodePtr);
6799     }
6800     Ops.push_back(RayExtent);
6801 
6802     auto packLanes = [&Ops, &S32, &B](Register Src) {
6803       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6804       Ops.push_back(Unmerge.getReg(0));
6805       Ops.push_back(Unmerge.getReg(1));
6806       Ops.push_back(Unmerge.getReg(2));
6807     };
6808 
6809     packLanes(RayOrigin);
6810     if (IsA16) {
6811       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6812       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6813       Register R1 = MRI.createGenericVirtualRegister(S32);
6814       Register R2 = MRI.createGenericVirtualRegister(S32);
6815       Register R3 = MRI.createGenericVirtualRegister(S32);
6816       B.buildMergeLikeInstr(R1,
6817                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6818       B.buildMergeLikeInstr(
6819           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6820       B.buildMergeLikeInstr(
6821           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6822       Ops.push_back(R1);
6823       Ops.push_back(R2);
6824       Ops.push_back(R3);
6825     } else {
6826       packLanes(RayDir);
6827       packLanes(RayInvDir);
6828     }
6829   }
6830 
6831   if (!UseNSA) {
6832     // Build a single vector containing all the operands prepared so far.
6833     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6834     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6835     Ops.clear();
6836     Ops.push_back(MergedOps);
6837   }
6838 
6839   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6840     .addDef(DstReg)
6841     .addImm(Opcode);
6842 
6843   for (Register R : Ops) {
6844     MIB.addUse(R);
6845   }
6846 
6847   MIB.addUse(TDescr)
6848      .addImm(IsA16 ? 1 : 0)
6849      .cloneMemRefs(MI);
6850 
6851   MI.eraseFromParent();
6852   return true;
6853 }
6854 
6855 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6856                                                MachineIRBuilder &B) const {
6857   unsigned Opc;
6858   int RoundMode = MI.getOperand(2).getImm();
6859 
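       // Only the TowardPositive and TowardNegative rounding modes map to target
       // pseudos; other modes are rejected.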
6860   if (RoundMode == (int)RoundingMode::TowardPositive)
6861     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6862   else if (RoundMode == (int)RoundingMode::TowardNegative)
6863     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6864   else
6865     return false;
6866 
6867   B.buildInstr(Opc)
6868       .addDef(MI.getOperand(0).getReg())
6869       .addUse(MI.getOperand(1).getReg());
6870 
6871   MI.eraseFromParent();
6872 
6873   return true;
6874 }
6875 
6876 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6877                                             MachineIRBuilder &B) const {
6878   const SITargetLowering *TLI = ST.getTargetLowering();
6879   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6880   Register DstReg = MI.getOperand(0).getReg();
6881   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6882   MI.eraseFromParent();
6883   return true;
6884 }
6885 
6886 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
6887                                          MachineIRBuilder &B) const {
6888   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
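       // Extract it with an unsigned bitfield extract: (TTMP8 >> 25) & 0x1f.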
6889   if (!ST.hasArchitectedSGPRs())
6890     return false;
6891   LLT S32 = LLT::scalar(32);
6892   Register DstReg = MI.getOperand(0).getReg();
6893   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
6894   auto LSB = B.buildConstant(S32, 25);
6895   auto Width = B.buildConstant(S32, 5);
6896   B.buildUbfx(DstReg, TTMP8, LSB, Width);
6897   MI.eraseFromParent();
6898   return true;
6899 }
6900 
6901 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
6902                                             MachineInstr &MI) const {
6903   MachineIRBuilder &B = Helper.MIRBuilder;
6904   MachineRegisterInfo &MRI = *B.getMRI();
6905 
6906   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
6907   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
6908   switch (IntrID) {
6909   case Intrinsic::amdgcn_if:
6910   case Intrinsic::amdgcn_else: {
6911     MachineInstr *Br = nullptr;
6912     MachineBasicBlock *UncondBrTarget = nullptr;
6913     bool Negated = false;
6914     if (MachineInstr *BrCond =
6915             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6916       const SIRegisterInfo *TRI
6917         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6918 
6919       Register Def = MI.getOperand(1).getReg();
6920       Register Use = MI.getOperand(3).getReg();
6921 
6922       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6923 
6924       if (Negated)
6925         std::swap(CondBrTarget, UncondBrTarget);
6926 
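           // Rewrite the conditional branch into the SI_IF/SI_ELSE pseudo, which
           // manipulates EXEC and branches to UncondBrTarget.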
6927       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6928       if (IntrID == Intrinsic::amdgcn_if) {
6929         B.buildInstr(AMDGPU::SI_IF)
6930           .addDef(Def)
6931           .addUse(Use)
6932           .addMBB(UncondBrTarget);
6933       } else {
6934         B.buildInstr(AMDGPU::SI_ELSE)
6935             .addDef(Def)
6936             .addUse(Use)
6937             .addMBB(UncondBrTarget);
6938       }
6939 
6940       if (Br) {
6941         Br->getOperand(0).setMBB(CondBrTarget);
6942       } else {
6943         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
6944         // since we're swapping branch targets it needs to be reinserted.
6945         // FIXME: IRTranslator should probably not do this
6946         B.buildBr(*CondBrTarget);
6947       }
6948 
6949       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
6950       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
6951       MI.eraseFromParent();
6952       BrCond->eraseFromParent();
6953       return true;
6954     }
6955 
6956     return false;
6957   }
6958   case Intrinsic::amdgcn_loop: {
6959     MachineInstr *Br = nullptr;
6960     MachineBasicBlock *UncondBrTarget = nullptr;
6961     bool Negated = false;
6962     if (MachineInstr *BrCond =
6963             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6964       const SIRegisterInfo *TRI
6965         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6966 
6967       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6968       Register Reg = MI.getOperand(2).getReg();
6969 
6970       if (Negated)
6971         std::swap(CondBrTarget, UncondBrTarget);
6972 
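           // SI_LOOP clears the finished lanes from exec and branches back to
           // the loop header while any lanes remain active.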
6973       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6974       B.buildInstr(AMDGPU::SI_LOOP)
6975         .addUse(Reg)
6976         .addMBB(UncondBrTarget);
6977 
6978       if (Br)
6979         Br->getOperand(0).setMBB(CondBrTarget);
6980       else
6981         B.buildBr(*CondBrTarget);
6982 
6983       MI.eraseFromParent();
6984       BrCond->eraseFromParent();
6985       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
6986       return true;
6987     }
6988 
6989     return false;
6990   }
6991   case Intrinsic::amdgcn_make_buffer_rsrc:
6992     return legalizePointerAsRsrcIntrin(MI, MRI, B);
6993   case Intrinsic::amdgcn_kernarg_segment_ptr:
6994     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
6995       // This only makes sense to call in a kernel, so just lower to null.
6996       B.buildConstant(MI.getOperand(0).getReg(), 0);
6997       MI.eraseFromParent();
6998       return true;
6999     }
7000 
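         // Inside a kernel the kernarg segment pointer is a preloaded SGPR
         // argument.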
7001     return legalizePreloadedArgIntrin(
7002       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7003   case Intrinsic::amdgcn_implicitarg_ptr:
7004     return legalizeImplicitArgPtr(MI, MRI, B);
7005   case Intrinsic::amdgcn_workitem_id_x:
7006     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7007                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7008   case Intrinsic::amdgcn_workitem_id_y:
7009     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7010                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7011   case Intrinsic::amdgcn_workitem_id_z:
7012     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7013                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7014   case Intrinsic::amdgcn_workgroup_id_x:
7015     return legalizePreloadedArgIntrin(MI, MRI, B,
7016                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7017   case Intrinsic::amdgcn_workgroup_id_y:
7018     return legalizePreloadedArgIntrin(MI, MRI, B,
7019                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7020   case Intrinsic::amdgcn_workgroup_id_z:
7021     return legalizePreloadedArgIntrin(MI, MRI, B,
7022                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7023   case Intrinsic::amdgcn_wave_id:
7024     return legalizeWaveID(MI, B);
7025   case Intrinsic::amdgcn_lds_kernel_id:
7026     return legalizePreloadedArgIntrin(MI, MRI, B,
7027                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7028   case Intrinsic::amdgcn_dispatch_ptr:
7029     return legalizePreloadedArgIntrin(MI, MRI, B,
7030                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
7031   case Intrinsic::amdgcn_queue_ptr:
7032     return legalizePreloadedArgIntrin(MI, MRI, B,
7033                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
7034   case Intrinsic::amdgcn_implicit_buffer_ptr:
7035     return legalizePreloadedArgIntrin(
7036       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7037   case Intrinsic::amdgcn_dispatch_id:
7038     return legalizePreloadedArgIntrin(MI, MRI, B,
7039                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
7040   case Intrinsic::r600_read_ngroups_x:
7041     // TODO: Emit error for hsa
7042     return legalizeKernargMemParameter(MI, B,
7043                                        SI::KernelInputOffsets::NGROUPS_X);
7044   case Intrinsic::r600_read_ngroups_y:
7045     return legalizeKernargMemParameter(MI, B,
7046                                        SI::KernelInputOffsets::NGROUPS_Y);
7047   case Intrinsic::r600_read_ngroups_z:
7048     return legalizeKernargMemParameter(MI, B,
7049                                        SI::KernelInputOffsets::NGROUPS_Z);
7050   case Intrinsic::r600_read_local_size_x:
7051     // TODO: Could insert G_ASSERT_ZEXT from s16
7052     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7053   case Intrinsic::r600_read_local_size_y:
7054     // TODO: Could insert G_ASSERT_ZEXT from s16
7055     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7056   case Intrinsic::r600_read_local_size_z:
7057     // TODO: Could insert G_ASSERT_ZEXT from s16
7058     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7059   case Intrinsic::r600_read_global_size_x:
7060     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7061   case Intrinsic::r600_read_global_size_y:
7062     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7063   case Intrinsic::r600_read_global_size_z:
7064     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7065   case Intrinsic::amdgcn_fdiv_fast:
7066     return legalizeFDIVFastIntrin(MI, MRI, B);
7067   case Intrinsic::amdgcn_is_shared:
7068     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7069   case Intrinsic::amdgcn_is_private:
7070     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7071   case Intrinsic::amdgcn_wavefrontsize: {
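         // The wavefront size is fixed per subtarget, so fold the intrinsic to a
         // constant.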
7072     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7073     MI.eraseFromParent();
7074     return true;
7075   }
7076   case Intrinsic::amdgcn_s_buffer_load:
7077     return legalizeSBufferLoad(Helper, MI);
7078   case Intrinsic::amdgcn_raw_buffer_store:
7079   case Intrinsic::amdgcn_raw_ptr_buffer_store:
7080   case Intrinsic::amdgcn_struct_buffer_store:
7081   case Intrinsic::amdgcn_struct_ptr_buffer_store:
7082     return legalizeBufferStore(MI, MRI, B, false, false);
7083   case Intrinsic::amdgcn_raw_buffer_store_format:
7084   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7085   case Intrinsic::amdgcn_struct_buffer_store_format:
7086   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7087     return legalizeBufferStore(MI, MRI, B, false, true);
7088   case Intrinsic::amdgcn_raw_tbuffer_store:
7089   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7090   case Intrinsic::amdgcn_struct_tbuffer_store:
7091   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7092     return legalizeBufferStore(MI, MRI, B, true, true);
7093   case Intrinsic::amdgcn_raw_buffer_load:
7094   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7095   case Intrinsic::amdgcn_struct_buffer_load:
7096   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7097     return legalizeBufferLoad(MI, MRI, B, false, false);
7098   case Intrinsic::amdgcn_raw_buffer_load_format:
7099   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7100   case Intrinsic::amdgcn_struct_buffer_load_format:
7101   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7102     return legalizeBufferLoad(MI, MRI, B, true, false);
7103   case Intrinsic::amdgcn_raw_tbuffer_load:
7104   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7105   case Intrinsic::amdgcn_struct_tbuffer_load:
7106   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7107     return legalizeBufferLoad(MI, MRI, B, true, true);
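       // All raw/struct buffer atomic variants funnel into one lowering path.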
7108   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7109   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7110   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7111   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7112   case Intrinsic::amdgcn_raw_buffer_atomic_add:
7113   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7114   case Intrinsic::amdgcn_struct_buffer_atomic_add:
7115   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7116   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7117   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7118   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7119   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7120   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7121   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7122   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7123   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7124   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7125   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7126   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7127   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7128   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7129   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7130   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7131   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7132   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7133   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7134   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7135   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7136   case Intrinsic::amdgcn_raw_buffer_atomic_and:
7137   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7138   case Intrinsic::amdgcn_struct_buffer_atomic_and:
7139   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7140   case Intrinsic::amdgcn_raw_buffer_atomic_or:
7141   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7142   case Intrinsic::amdgcn_struct_buffer_atomic_or:
7143   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7144   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7145   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7146   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7147   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7148   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7149   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7150   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7151   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7152   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7153   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7154   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7155   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7156   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7157   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7158   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7159   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7160   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7161   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7162   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7163   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7164   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7165   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7166   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7167   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7168   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7169   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7170   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7171   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7172   case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
7173   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
7174   case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7175   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
7176     return legalizeBufferAtomic(MI, B, IntrID);
7177   case Intrinsic::trap:
7178     return legalizeTrapIntrinsic(MI, MRI, B);
7179   case Intrinsic::debugtrap:
7180     return legalizeDebugTrapIntrinsic(MI, MRI, B);
7181   case Intrinsic::amdgcn_rsq_clamp:
7182     return legalizeRsqClampIntrinsic(MI, MRI, B);
7183   case Intrinsic::amdgcn_ds_fadd:
7184   case Intrinsic::amdgcn_ds_fmin:
7185   case Intrinsic::amdgcn_ds_fmax:
7186     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7187   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7188     return legalizeBVHIntrinsic(MI, B);
7189   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7190   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7191   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7192   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7193   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7194   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7195   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7196   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
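         // The sparsity index operand must be a 32-bit scalar for selection, so
         // any-extend narrower indices.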
7197     Register Index = MI.getOperand(5).getReg();
7198     LLT S32 = LLT::scalar(32);
7199     if (MRI.getType(Index) != S32)
7200       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7201     return true;
7202   }
7203   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7204   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7205   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
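         // Same widening as above; the integer variants carry the index at a
         // different operand position.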
7206     Register Index = MI.getOperand(7).getReg();
7207     LLT S32 = LLT::scalar(32);
7208     if (MRI.getType(Index) != S32)
7209       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7210     return true;
7211   }
7212   case Intrinsic::amdgcn_fmed3: {
7213     GISelChangeObserver &Observer = Helper.Observer;
7214 
7215     // FIXME: This is to work around the inability of tablegen match combiners
7216     // to match intrinsics in patterns.
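         // Rewrite the intrinsic in place into the target pseudo and drop the
         // intrinsic ID operand.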
7217     Observer.changingInstr(MI);
7218     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7219     MI.removeOperand(1);
7220     Observer.changedInstr(MI);
7221     return true;
7222   }
7223   default: {
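         // Image intrinsics are handled through the dim-intrinsic table; anything
         // else needs no custom legalization here.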
7224     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7225             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7226       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7227     return true;
7228   }
7229   }
7230 
7231   return true;
7232 }
7233