1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47   "amdgpu-global-isel-new-legality",
48   cl::desc("Use GlobalISel desired legality, rather than try to use "
49            "rules compatible with selection patterns"),
50   cl::init(false),
51   cl::ReallyHidden);
52 
53 static constexpr unsigned MaxRegisterSize = 1024;
54 
55 // Round the number of elements to the next power of two elements
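// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.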
56 static LLT getPow2VectorType(LLT Ty) {
57   unsigned NElts = Ty.getNumElements();
58   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
61 
62 // Round the number of bits to the next power of two bits
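// e.g. s48 -> s64, s96 -> s128.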
63 static LLT getPow2ScalarType(LLT Ty) {
64   unsigned Bits = Ty.getSizeInBits();
65   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
66   return LLT::scalar(Pow2Bits);
67 }
68 
69 /// \returns true if this is an odd sized vector which should be widened by adding
70 /// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71 /// excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     if (!Ty.isVector())
76       return false;
77 
78     const LLT EltTy = Ty.getElementType();
79     const unsigned EltSize = EltTy.getSizeInBits();
80     return Ty.getNumElements() % 2 != 0 &&
81            EltSize > 1 && EltSize < 32 &&
82            Ty.getSizeInBits() % 32 != 0;
83   };
84 }
85 
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     return Ty.getSizeInBits() % 32 == 0;
90   };
91 }
92 
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getScalarType();
97     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98   };
99 }
100 
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     return std::pair(TypeIdx,
106                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107   };
108 }
109 
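// Reduce the number of elements so that each resulting piece is at most 64 bits,
// e.g. <4 x s32> -> <2 x s32>.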
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111   return [=](const LegalityQuery &Query) {
112     const LLT Ty = Query.Types[TypeIdx];
113     const LLT EltTy = Ty.getElementType();
114     unsigned Size = Ty.getSizeInBits();
115     unsigned Pieces = (Size + 63) / 64;
116     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117     return std::pair(TypeIdx, LLT::scalarOrVector(
118                                   ElementCount::getFixed(NewNumElts), EltTy));
119   };
120 }
121 
122 // Increase the number of vector elements so the total size reaches the next
123 // multiple of 32 bits.
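// e.g. <3 x s16> (48 bits) -> <4 x s16> (64 bits).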
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125   return [=](const LegalityQuery &Query) {
126     const LLT Ty = Query.Types[TypeIdx];
127 
128     const LLT EltTy = Ty.getElementType();
129     const int Size = Ty.getSizeInBits();
130     const int EltSize = EltTy.getSizeInBits();
131     const int NextMul32 = (Size + 31) / 32;
132 
133     assert(EltSize < 32);
134 
135     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137   };
138 }
139 
140 // Increase the number of vector elements to reach the next legal RegClass.
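// Only 32- and 64-bit element types are expected here (see the asserts below);
// the element count grows until the total size matches an existing SGPR class width.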
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142   return [=](const LegalityQuery &Query) {
143     const LLT Ty = Query.Types[TypeIdx];
144     const unsigned NumElts = Ty.getNumElements();
145     const unsigned EltSize = Ty.getElementType().getSizeInBits();
146     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147 
148     assert(EltSize == 32 || EltSize == 64);
149     assert(Ty.getSizeInBits() < MaxRegisterSize);
150 
151     unsigned NewNumElts;
152     // Find the nearest legal RegClass that is larger than the current type.
153     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155         break;
156     }
157 
158     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159   };
160 }
161 
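// Scalar form used when casting buffer resources: s128 for a single p8, or a
// vector of s128 for a vector of resources.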
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163   if (!Ty.isVector())
164     return LLT::scalar(128);
165   const ElementCount NumElems = Ty.getElementCount();
166   return LLT::vector(NumElems, LLT::scalar(128));
167 }
168 
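// Register form used when casting buffer resources: <4 x s32> for a single p8,
// or <4*N x s32> for a vector of N resources.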
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170   if (!Ty.isVector())
171     return LLT::fixed_vector(4, LLT::scalar(32));
172   const unsigned NumElems = Ty.getElementCount().getFixedValue();
173   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
175 
176 static LLT getBitcastRegisterType(const LLT Ty) {
177   const unsigned Size = Ty.getSizeInBits();
178 
179   if (Size <= 32) {
180     // <2 x s8> -> s16
181     // <4 x s8> -> s32
182     return LLT::scalar(Size);
183   }
184 
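  // s64       -> <2 x s32>
  // <6 x s16> -> <3 x s32>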
185   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
187 
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189   return [=](const LegalityQuery &Query) {
190     const LLT Ty = Query.Types[TypeIdx];
191     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192   };
193 }
194 
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196   return [=](const LegalityQuery &Query) {
197     const LLT Ty = Query.Types[TypeIdx];
198     unsigned Size = Ty.getSizeInBits();
199     assert(Size % 32 == 0);
200     return std::pair(
201         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202   };
203 }
204 
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206   return [=](const LegalityQuery &Query) {
207     const LLT QueryTy = Query.Types[TypeIdx];
208     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209   };
210 }
211 
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213   return [=](const LegalityQuery &Query) {
214     const LLT QueryTy = Query.Types[TypeIdx];
215     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216   };
217 }
218 
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223   };
224 }
225 
226 static bool isRegisterSize(unsigned Size) {
227   return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229 
230 static bool isRegisterVectorElementType(LLT EltTy) {
231   const int EltSize = EltTy.getSizeInBits();
232   return EltSize == 16 || EltSize % 32 == 0;
233 }
234 
235 static bool isRegisterVectorType(LLT Ty) {
236   const int EltSize = Ty.getElementType().getSizeInBits();
237   return EltSize == 32 || EltSize == 64 ||
238          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239          EltSize == 128 || EltSize == 256;
240 }
241 
242 static bool isRegisterType(LLT Ty) {
243   if (!isRegisterSize(Ty.getSizeInBits()))
244     return false;
245 
246   if (Ty.isVector())
247     return isRegisterVectorType(Ty);
248 
249   return true;
250 }
251 
252 // Any combination of 32 or 64-bit elements up to the maximum register size, and
253 // multiples of v2s16.
254 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
255   return [=](const LegalityQuery &Query) {
256     return isRegisterType(Query.Types[TypeIdx]);
257   };
258 }
259 
260 // RegisterType that doesn't have a corresponding RegClass.
261 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
262   return [=](const LegalityQuery &Query) {
263     LLT Ty = Query.Types[TypeIdx];
264     return isRegisterType(Ty) &&
265            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
266   };
267 }
268 
269 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
270   return [=](const LegalityQuery &Query) {
271     const LLT QueryTy = Query.Types[TypeIdx];
272     if (!QueryTy.isVector())
273       return false;
274     const LLT EltTy = QueryTy.getElementType();
275     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
276   };
277 }
278 
279 // If we have a truncating store or an extending load with a data size larger
280 // than 32-bits, we need to reduce to a 32-bit type.
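// e.g. an s64 extending load from 32-bit (or smaller) memory.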
281 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
282   return [=](const LegalityQuery &Query) {
283     const LLT Ty = Query.Types[TypeIdx];
284     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
286   };
287 }
288 
289 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
290 // handle some operations by just promoting the register during
291 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
292 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
293                                     bool IsLoad, bool IsAtomic) {
294   switch (AS) {
295   case AMDGPUAS::PRIVATE_ADDRESS:
296     // FIXME: Private element size.
297     return ST.enableFlatScratch() ? 128 : 32;
298   case AMDGPUAS::LOCAL_ADDRESS:
299     return ST.useDS128() ? 128 : 64;
300   case AMDGPUAS::GLOBAL_ADDRESS:
301   case AMDGPUAS::CONSTANT_ADDRESS:
302   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
303   case AMDGPUAS::BUFFER_RESOURCE:
304     // Treat constant and global as identical. SMRD loads are sometimes usable for
305     // global loads (ideally constant address space should be eliminated)
306     // depending on the context. Legality cannot be context dependent, but
307     // RegBankSelect can split the load as necessary depending on the pointer
308     // register bank/uniformity and if the memory is invariant or not written in a
309     // kernel.
310     return IsLoad ? 512 : 128;
311   default:
312     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
313     // if they may alias scratch depending on the subtarget.  This needs to be
314     // moved to custom handling to use addressMayBeAccessedAsPrivate
315     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
316   }
317 }
318 
319 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
320                                  const LegalityQuery &Query) {
321   const LLT Ty = Query.Types[0];
322 
323   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
324   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
325 
326   unsigned RegSize = Ty.getSizeInBits();
327   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
328   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
329   unsigned AS = Query.Types[1].getAddressSpace();
330 
331   // All of these need to be custom lowered to cast the pointer operand.
332   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
333     return false;
334 
335   // Do not handle extending vector loads.
336   if (Ty.isVector() && MemSize != RegSize)
337     return false;
338 
339   // TODO: We should be able to widen loads if the alignment is high enough, but
340   // we also need to modify the memory access size.
341 #if 0
342   // Accept widening loads based on alignment.
343   if (IsLoad && MemSize < Size)
344     MemSize = std::max(MemSize, Align);
345 #endif
346 
347   // Only 1-byte and 2-byte to 32-bit extloads are valid.
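  // e.g. reject an extending load of s16 memory into an s64 register.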
348   if (MemSize != RegSize && RegSize != 32)
349     return false;
350 
351   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
352                                     Query.MMODescrs[0].Ordering !=
353                                         AtomicOrdering::NotAtomic))
354     return false;
355 
356   switch (MemSize) {
357   case 8:
358   case 16:
359   case 32:
360   case 64:
361   case 128:
362     break;
363   case 96:
364     if (!ST.hasDwordx3LoadStores())
365       return false;
366     break;
367   case 256:
368   case 512:
369     // These may contextually need to be broken down.
370     break;
371   default:
372     return false;
373   }
374 
375   assert(RegSize >= MemSize);
376 
377   if (AlignBits < MemSize) {
378     const SITargetLowering *TLI = ST.getTargetLowering();
379     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
380                                                  Align(AlignBits / 8)))
381       return false;
382   }
383 
384   return true;
385 }
386 
387 // The newer buffer intrinsic forms take their resource arguments as
388 // pointers in address space 8, aka s128 values. However, in order to not break
389 // SelectionDAG, the underlying operations have to continue to take v4i32
390 // arguments. Therefore, we convert resource pointers (or vectors of them)
391 // to integer values here.
392 static bool hasBufferRsrcWorkaround(const LLT Ty) {
393   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
394     return true;
395   if (Ty.isVector()) {
396     const LLT ElemTy = Ty.getElementType();
397     return hasBufferRsrcWorkaround(ElemTy);
398   }
399   return false;
400 }
401 
402 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
403 // work around this. Eventually it should ignore the type for loads and only care
404 // about the size. Return true in cases where we will work around this for now by
405 // bitcasting.
406 static bool loadStoreBitcastWorkaround(const LLT Ty) {
407   if (EnableNewLegality)
408     return false;
409 
410   const unsigned Size = Ty.getSizeInBits();
411   if (Size <= 64)
412     return false;
413   // Address space 8 pointers get their own workaround.
414   if (hasBufferRsrcWorkaround(Ty))
415     return false;
416   if (!Ty.isVector())
417     return true;
418 
419   LLT EltTy = Ty.getElementType();
420   if (EltTy.isPointer())
421     return true;
422 
423   unsigned EltSize = EltTy.getSizeInBits();
424   return EltSize != 32 && EltSize != 64;
425 }
426 
427 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
428   const LLT Ty = Query.Types[0];
429   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
430          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
431 }
432 
433 /// Return true if a load or store of the type should be lowered with a bitcast
434 /// to a different type.
435 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
436                                        const LLT MemTy) {
437   const unsigned MemSizeInBits = MemTy.getSizeInBits();
438   const unsigned Size = Ty.getSizeInBits();
439   if (Size != MemSizeInBits)
440     return Size <= 32 && Ty.isVector();
441 
442   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
443     return true;
444 
445   // Don't try to handle bitcasting vector ext loads for now.
446   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
447          (Size <= 32 || isRegisterSize(Size)) &&
448          !isRegisterVectorElementType(Ty.getElementType());
449 }
450 
451 /// Return true if we should legalize a load by widening an odd sized memory
452 /// access up to the alignment. Note that in this case the memory access itself
453 /// changes, not the size of the result register.
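/// e.g. on a subtarget without 96-bit memory operations, a 96-bit load with
/// 128-bit alignment may be widened to a 128-bit load.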
454 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
455                             uint64_t AlignInBits, unsigned AddrSpace,
456                             unsigned Opcode) {
457   unsigned SizeInBits = MemoryTy.getSizeInBits();
458   // We don't want to widen cases that are naturally legal.
459   if (isPowerOf2_32(SizeInBits))
460     return false;
461 
462   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
463   // end up widening these for a scalar load during RegBankSelect, if we don't
464   // have 96-bit scalar loads.
465   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466     return false;
467 
468   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
469     return false;
470 
471   // A load is known dereferenceable up to the alignment, so it's legal to widen
472   // to it.
473   //
474   // TODO: Could check dereferenceable for less aligned cases.
475   unsigned RoundedSize = NextPowerOf2(SizeInBits);
476   if (AlignInBits < RoundedSize)
477     return false;
478 
479   // Do not widen if it would introduce a slow unaligned load.
480   const SITargetLowering *TLI = ST.getTargetLowering();
481   unsigned Fast = 0;
482   return TLI->allowsMisalignedMemoryAccessesImpl(
483              RoundedSize, AddrSpace, Align(AlignInBits / 8),
484              MachineMemOperand::MOLoad, &Fast) &&
485          Fast;
486 }
487 
488 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489                             unsigned Opcode) {
490   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491     return false;
492 
493   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494                          Query.MMODescrs[0].AlignInBits,
495                          Query.Types[1].getAddressSpace(), Opcode);
496 }
497 
498 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
499 /// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
500 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
501 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
502                                    MachineRegisterInfo &MRI, unsigned Idx) {
503   MachineOperand &MO = MI.getOperand(Idx);
504 
505   const LLT PointerTy = MRI.getType(MO.getReg());
506 
507   // Paranoidly prevent us from doing this multiple times.
508   if (!hasBufferRsrcWorkaround(PointerTy))
509     return PointerTy;
510 
511   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
512   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
513   if (!PointerTy.isVector()) {
514     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
515     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
516     const LLT S32 = LLT::scalar(32);
517 
518     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
519     std::array<Register, 4> VectorElems;
520     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
521     for (unsigned I = 0; I < NumParts; ++I)
522       VectorElems[I] =
523           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
524     B.buildMergeValues(MO, VectorElems);
525     MO.setReg(VectorReg);
526     return VectorTy;
527   }
528   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
529   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
530   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
531   B.buildIntToPtr(MO, Scalar);
532   MO.setReg(BitcastReg);
533 
534   return VectorTy;
535 }
536 
537 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
538 /// the form in which the value must be in order to be passed to the low-level
539 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
540 /// needed in order to account for the fact that we can't define a register
541 /// class for s128 without breaking SelectionDAG.
542 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
543   MachineRegisterInfo &MRI = *B.getMRI();
544   const LLT PointerTy = MRI.getType(Pointer);
545   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
546   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
547 
548   if (!PointerTy.isVector()) {
549     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
550     SmallVector<Register, 4> PointerParts;
551     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
552     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
553     for (unsigned I = 0; I < NumParts; ++I)
554       PointerParts.push_back(Unmerged.getReg(I));
555     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
556   }
557   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
558   return B.buildBitcast(VectorTy, Scalar).getReg(0);
559 }
560 
561 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
562                                      unsigned Idx) {
563   MachineOperand &MO = MI.getOperand(Idx);
564 
565   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
566   // Paranoidly prevent us from doing this multiple times.
567   if (!hasBufferRsrcWorkaround(PointerTy))
568     return;
569   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
570 }
571 
572 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
573                                          const GCNTargetMachine &TM)
574   :  ST(ST_) {
575   using namespace TargetOpcode;
576 
577   auto GetAddrSpacePtr = [&TM](unsigned AS) {
578     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
579   };
580 
581   const LLT S1 = LLT::scalar(1);
582   const LLT S8 = LLT::scalar(8);
583   const LLT S16 = LLT::scalar(16);
584   const LLT S32 = LLT::scalar(32);
585   const LLT S64 = LLT::scalar(64);
586   const LLT S128 = LLT::scalar(128);
587   const LLT S256 = LLT::scalar(256);
588   const LLT S512 = LLT::scalar(512);
589   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
590 
591   const LLT V2S8 = LLT::fixed_vector(2, 8);
592   const LLT V2S16 = LLT::fixed_vector(2, 16);
593   const LLT V4S16 = LLT::fixed_vector(4, 16);
594 
595   const LLT V2S32 = LLT::fixed_vector(2, 32);
596   const LLT V3S32 = LLT::fixed_vector(3, 32);
597   const LLT V4S32 = LLT::fixed_vector(4, 32);
598   const LLT V5S32 = LLT::fixed_vector(5, 32);
599   const LLT V6S32 = LLT::fixed_vector(6, 32);
600   const LLT V7S32 = LLT::fixed_vector(7, 32);
601   const LLT V8S32 = LLT::fixed_vector(8, 32);
602   const LLT V9S32 = LLT::fixed_vector(9, 32);
603   const LLT V10S32 = LLT::fixed_vector(10, 32);
604   const LLT V11S32 = LLT::fixed_vector(11, 32);
605   const LLT V12S32 = LLT::fixed_vector(12, 32);
606   const LLT V13S32 = LLT::fixed_vector(13, 32);
607   const LLT V14S32 = LLT::fixed_vector(14, 32);
608   const LLT V15S32 = LLT::fixed_vector(15, 32);
609   const LLT V16S32 = LLT::fixed_vector(16, 32);
610   const LLT V32S32 = LLT::fixed_vector(32, 32);
611 
612   const LLT V2S64 = LLT::fixed_vector(2, 64);
613   const LLT V3S64 = LLT::fixed_vector(3, 64);
614   const LLT V4S64 = LLT::fixed_vector(4, 64);
615   const LLT V5S64 = LLT::fixed_vector(5, 64);
616   const LLT V6S64 = LLT::fixed_vector(6, 64);
617   const LLT V7S64 = LLT::fixed_vector(7, 64);
618   const LLT V8S64 = LLT::fixed_vector(8, 64);
619   const LLT V16S64 = LLT::fixed_vector(16, 64);
620 
621   std::initializer_list<LLT> AllS32Vectors =
622     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
623      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
624   std::initializer_list<LLT> AllS64Vectors =
625     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
626 
627   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
628   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
629   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
630   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
631   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
632   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
633   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636   const LLT BufferStridedPtr =
637       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
638 
639   const LLT CodePtr = FlatPtr;
640 
641   const std::initializer_list<LLT> AddrSpaces64 = {
642     GlobalPtr, ConstantPtr, FlatPtr
643   };
644 
645   const std::initializer_list<LLT> AddrSpaces32 = {
646     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
647   };
648 
649   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
650 
651   const std::initializer_list<LLT> FPTypesBase = {
652     S32, S64
653   };
654 
655   const std::initializer_list<LLT> FPTypes16 = {
656     S32, S64, S16
657   };
658 
659   const std::initializer_list<LLT> FPTypesPK16 = {
660     S32, S64, S16, V2S16
661   };
662 
663   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
664 
665   // s1 for VCC branches, s32 for SCC branches.
666   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
667 
668   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
669   // elements for v3s16
670   getActionDefinitionsBuilder(G_PHI)
671       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
672       .legalFor(AllS32Vectors)
673       .legalFor(AllS64Vectors)
674       .legalFor(AddrSpaces64)
675       .legalFor(AddrSpaces32)
676       .legalFor(AddrSpaces128)
677       .legalIf(isPointer(0))
678       .clampScalar(0, S16, S256)
679       .widenScalarToNextPow2(0, 32)
680       .clampMaxNumElements(0, S32, 16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .scalarize(0);
683 
684   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
685     // Full set of gfx9 features.
686     if (ST.hasScalarAddSub64()) {
687       getActionDefinitionsBuilder({G_ADD, G_SUB})
688           .legalFor({S64, S32, S16, V2S16})
689           .clampMaxNumElementsStrict(0, S16, 2)
690           .scalarize(0)
691           .minScalar(0, S16)
692           .widenScalarToNextMultipleOf(0, 32)
693           .maxScalar(0, S32);
694     } else {
695       getActionDefinitionsBuilder({G_ADD, G_SUB})
696           .legalFor({S32, S16, V2S16})
697           .clampMaxNumElementsStrict(0, S16, 2)
698           .scalarize(0)
699           .minScalar(0, S16)
700           .widenScalarToNextMultipleOf(0, 32)
701           .maxScalar(0, S32);
702     }
703 
704     if (ST.hasScalarSMulU64()) {
705       getActionDefinitionsBuilder(G_MUL)
706           .legalFor({S64, S32, S16, V2S16})
707           .clampMaxNumElementsStrict(0, S16, 2)
708           .scalarize(0)
709           .minScalar(0, S16)
710           .widenScalarToNextMultipleOf(0, 32)
711           .custom();
712     } else {
713       getActionDefinitionsBuilder(G_MUL)
714           .legalFor({S32, S16, V2S16})
715           .clampMaxNumElementsStrict(0, S16, 2)
716           .scalarize(0)
717           .minScalar(0, S16)
718           .widenScalarToNextMultipleOf(0, 32)
719           .custom();
720     }
721     assert(ST.hasMad64_32());
722 
723     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
724       .legalFor({S32, S16, V2S16}) // Clamp modifier
725       .minScalarOrElt(0, S16)
726       .clampMaxNumElementsStrict(0, S16, 2)
727       .scalarize(0)
728       .widenScalarToNextPow2(0, 32)
729       .lower();
730   } else if (ST.has16BitInsts()) {
731     getActionDefinitionsBuilder({G_ADD, G_SUB})
732       .legalFor({S32, S16})
733       .minScalar(0, S16)
734       .widenScalarToNextMultipleOf(0, 32)
735       .maxScalar(0, S32)
736       .scalarize(0);
737 
738     getActionDefinitionsBuilder(G_MUL)
739       .legalFor({S32, S16})
740       .scalarize(0)
741       .minScalar(0, S16)
742       .widenScalarToNextMultipleOf(0, 32)
743       .custom();
744     assert(ST.hasMad64_32());
745 
746     // Technically the saturating operations require clamp bit support, but this
747     // was introduced at the same time as 16-bit operations.
748     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
749       .legalFor({S32, S16}) // Clamp modifier
750       .minScalar(0, S16)
751       .scalarize(0)
752       .widenScalarToNextPow2(0, 16)
753       .lower();
754 
755     // We're just lowering this, but it helps get a better result to try to
756     // coerce to the desired type first.
757     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
758       .minScalar(0, S16)
759       .scalarize(0)
760       .lower();
761   } else {
762     getActionDefinitionsBuilder({G_ADD, G_SUB})
763       .legalFor({S32})
764       .widenScalarToNextMultipleOf(0, 32)
765       .clampScalar(0, S32, S32)
766       .scalarize(0);
767 
768     auto &Mul = getActionDefinitionsBuilder(G_MUL)
769       .legalFor({S32})
770       .scalarize(0)
771       .minScalar(0, S32)
772       .widenScalarToNextMultipleOf(0, 32);
773 
774     if (ST.hasMad64_32())
775       Mul.custom();
776     else
777       Mul.maxScalar(0, S32);
778 
779     if (ST.hasIntClamp()) {
780       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781         .legalFor({S32}) // Clamp modifier.
782         .scalarize(0)
783         .minScalarOrElt(0, S32)
784         .lower();
785     } else {
786       // Clamp bit support was added in VI, along with 16-bit operations.
787       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
788         .minScalar(0, S32)
789         .scalarize(0)
790         .lower();
791     }
792 
793     // FIXME: DAG expansion gets better results. The widening uses the smaller
794     // range values and goes for the min/max lowering directly.
795     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796       .minScalar(0, S32)
797       .scalarize(0)
798       .lower();
799   }
800 
801   getActionDefinitionsBuilder(
802       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
803       .customFor({S32, S64})
804       .clampScalar(0, S32, S64)
805       .widenScalarToNextPow2(0, 32)
806       .scalarize(0);
807 
808   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
809                    .legalFor({S32})
810                    .maxScalar(0, S32);
811 
812   if (ST.hasVOP3PInsts()) {
813     Mulh
814       .clampMaxNumElements(0, S8, 2)
815       .lowerFor({V2S8});
816   }
817 
818   Mulh
819     .scalarize(0)
820     .lower();
821 
822   // Report legal for any types we can handle anywhere. For the cases only legal
823   // on the SALU, RegBankSelect will be able to re-legalize.
824   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
825     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
826     .clampScalar(0, S32, S64)
827     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
828     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
829     .widenScalarToNextPow2(0)
830     .scalarize(0);
831 
832   getActionDefinitionsBuilder(
833       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
834       .legalFor({{S32, S1}, {S32, S32}})
835       .clampScalar(0, S32, S32)
836       .scalarize(0);
837 
838   getActionDefinitionsBuilder(G_BITCAST)
839     // Don't worry about the size constraint.
840     .legalIf(all(isRegisterType(0), isRegisterType(1)))
841     .lower();
842 
843 
844   getActionDefinitionsBuilder(G_CONSTANT)
845     .legalFor({S1, S32, S64, S16, GlobalPtr,
846                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
847     .legalIf(isPointer(0))
848     .clampScalar(0, S32, S64)
849     .widenScalarToNextPow2(0);
850 
851   getActionDefinitionsBuilder(G_FCONSTANT)
852     .legalFor({S32, S64, S16})
853     .clampScalar(0, S16, S64);
854 
855   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
856       .legalIf(isRegisterType(0))
857       // s1 and s16 are special cases because they have legal operations on
858       // them, but don't really occupy registers in the normal way.
859       .legalFor({S1, S16})
860       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
861       .clampScalarOrElt(0, S32, MaxScalar)
862       .widenScalarToNextPow2(0, 32)
863       .clampMaxNumElements(0, S32, 16);
864 
865   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
866 
867   // If the amount is divergent, we have to do a wave reduction to get the
868   // maximum value, so this is expanded during RegBankSelect.
869   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
870     .legalFor({{PrivatePtr, S32}});
871 
872   getActionDefinitionsBuilder(G_STACKSAVE)
873     .customFor({PrivatePtr});
874   getActionDefinitionsBuilder(G_STACKRESTORE)
875     .legalFor({PrivatePtr});
876 
877   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
878     .customIf(typeIsNot(0, PrivatePtr));
879 
880   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
881 
882   auto &FPOpActions = getActionDefinitionsBuilder(
883     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
884       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
885     .legalFor({S32, S64});
886   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
887     .customFor({S32, S64});
888   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
889     .customFor({S32, S64});
890 
891   if (ST.has16BitInsts()) {
892     if (ST.hasVOP3PInsts())
893       FPOpActions.legalFor({S16, V2S16});
894     else
895       FPOpActions.legalFor({S16});
896 
897     TrigActions.customFor({S16});
898     FDIVActions.customFor({S16});
899   }
900 
901   if (ST.hasPackedFP32Ops()) {
902     FPOpActions.legalFor({V2S32});
903     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
904   }
905 
906   auto &MinNumMaxNum = getActionDefinitionsBuilder({
907       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
908 
909   if (ST.hasVOP3PInsts()) {
910     MinNumMaxNum.customFor(FPTypesPK16)
911       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912       .clampMaxNumElements(0, S16, 2)
913       .clampScalar(0, S16, S64)
914       .scalarize(0);
915   } else if (ST.has16BitInsts()) {
916     MinNumMaxNum.customFor(FPTypes16)
917       .clampScalar(0, S16, S64)
918       .scalarize(0);
919   } else {
920     MinNumMaxNum.customFor(FPTypesBase)
921       .clampScalar(0, S32, S64)
922       .scalarize(0);
923   }
924 
925   if (ST.hasVOP3PInsts())
926     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
927 
928   FPOpActions
929     .scalarize(0)
930     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
931 
932   TrigActions
933     .scalarize(0)
934     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
935 
936   FDIVActions
937     .scalarize(0)
938     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
939 
940   getActionDefinitionsBuilder({G_FNEG, G_FABS})
941     .legalFor(FPTypesPK16)
942     .clampMaxNumElementsStrict(0, S16, 2)
943     .scalarize(0)
944     .clampScalar(0, S16, S64);
945 
946   if (ST.has16BitInsts()) {
947     getActionDefinitionsBuilder(G_FSQRT)
948       .legalFor({S16})
949       .customFor({S32, S64})
950       .scalarize(0)
951       .unsupported();
952     getActionDefinitionsBuilder(G_FFLOOR)
953       .legalFor({S32, S64, S16})
954       .scalarize(0)
955       .clampScalar(0, S16, S64);
956 
957     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
958       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
959       .scalarize(0)
960       .maxScalarIf(typeIs(0, S16), 1, S16)
961       .clampScalar(1, S32, S32)
962       .lower();
963 
964     getActionDefinitionsBuilder(G_FFREXP)
965       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
966       .scalarize(0)
967       .lower();
968   } else {
969     getActionDefinitionsBuilder(G_FSQRT)
970       .customFor({S32, S64, S16})
971       .scalarize(0)
972       .unsupported();
973 
974 
975     if (ST.hasFractBug()) {
976       getActionDefinitionsBuilder(G_FFLOOR)
977         .customFor({S64})
978         .legalFor({S32, S64})
979         .scalarize(0)
980         .clampScalar(0, S32, S64);
981     } else {
982       getActionDefinitionsBuilder(G_FFLOOR)
983         .legalFor({S32, S64})
984         .scalarize(0)
985         .clampScalar(0, S32, S64);
986     }
987 
988     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
989       .legalFor({{S32, S32}, {S64, S32}})
990       .scalarize(0)
991       .clampScalar(0, S32, S64)
992       .clampScalar(1, S32, S32)
993       .lower();
994 
995     getActionDefinitionsBuilder(G_FFREXP)
996       .customFor({{S32, S32}, {S64, S32}})
997       .scalarize(0)
998       .minScalar(0, S32)
999       .clampScalar(1, S32, S32)
1000       .lower();
1001   }
1002 
1003   getActionDefinitionsBuilder(G_FPTRUNC)
1004     .legalFor({{S32, S64}, {S16, S32}})
1005     .scalarize(0)
1006     .lower();
1007 
1008   getActionDefinitionsBuilder(G_FPEXT)
1009     .legalFor({{S64, S32}, {S32, S16}})
1010     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1011     .scalarize(0);
1012 
1013   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1014   if (ST.has16BitInsts()) {
1015     FSubActions
1016       // Use actual fsub instruction
1017       .legalFor({S32, S16})
1018       // Must use fadd + fneg
1019       .lowerFor({S64, V2S16});
1020   } else {
1021     FSubActions
1022       // Use actual fsub instruction
1023       .legalFor({S32})
1024       // Must use fadd + fneg
1025       .lowerFor({S64, S16, V2S16});
1026   }
1027 
1028   FSubActions
1029     .scalarize(0)
1030     .clampScalar(0, S32, S64);
1031 
1032   // Whether this is legal depends on the floating point mode for the function.
1033   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1034   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1035     FMad.customFor({S32, S16});
1036   else if (ST.hasMadMacF32Insts())
1037     FMad.customFor({S32});
1038   else if (ST.hasMadF16())
1039     FMad.customFor({S16});
1040   FMad.scalarize(0)
1041       .lower();
1042 
1043   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1044   if (ST.has16BitInsts()) {
1045     FRem.customFor({S16, S32, S64});
1046   } else {
1047     FRem.minScalar(0, S32)
1048         .customFor({S32, S64});
1049   }
1050   FRem.scalarize(0);
1051 
1052   // TODO: Do we need to clamp maximum bitwidth?
1053   getActionDefinitionsBuilder(G_TRUNC)
1054     .legalIf(isScalar(0))
1055     .legalFor({{V2S16, V2S32}})
1056     .clampMaxNumElements(0, S16, 2)
1057     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1058     // situations (like an invalid implicit use), we don't want to infinite loop
1059     // in the legalizer.
1060     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1061     .alwaysLegal();
1062 
1063   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1064     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1065                {S32, S1}, {S64, S1}, {S16, S1}})
1066     .scalarize(0)
1067     .clampScalar(0, S32, S64)
1068     .widenScalarToNextPow2(1, 32);
1069 
1070   // TODO: Split s1->s64 during regbankselect for VALU.
1071   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1072                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1073                     .lowerIf(typeIs(1, S1))
1074                     .customFor({{S32, S64}, {S64, S64}});
1075   if (ST.has16BitInsts())
1076     IToFP.legalFor({{S16, S16}});
1077   IToFP.clampScalar(1, S32, S64)
1078        .minScalar(0, S32)
1079        .scalarize(0)
1080        .widenScalarToNextPow2(1);
1081 
1082   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1083     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1084     .customFor({{S64, S32}, {S64, S64}})
1085     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1086   if (ST.has16BitInsts())
1087     FPToI.legalFor({{S16, S16}});
1088   else
1089     FPToI.minScalar(1, S32);
1090 
1091   FPToI.minScalar(0, S32)
1092        .widenScalarToNextPow2(0, 32)
1093        .scalarize(0)
1094        .lower();
1095 
1096   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1097       .customFor({S16, S32})
1098       .scalarize(0)
1099       .lower();
1100 
1101   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1102   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1103       .scalarize(0)
1104       .lower();
1105 
1106   if (ST.has16BitInsts()) {
1107     getActionDefinitionsBuilder(
1108         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1109         .legalFor({S16, S32, S64})
1110         .clampScalar(0, S16, S64)
1111         .scalarize(0);
1112   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1113     getActionDefinitionsBuilder(
1114         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1115         .legalFor({S32, S64})
1116         .clampScalar(0, S32, S64)
1117         .scalarize(0);
1118   } else {
1119     getActionDefinitionsBuilder(
1120         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1121         .legalFor({S32})
1122         .customFor({S64})
1123         .clampScalar(0, S32, S64)
1124         .scalarize(0);
1125   }
1126 
1127   getActionDefinitionsBuilder(G_PTR_ADD)
1128       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1129       .legalIf(all(isPointer(0), sameSize(0, 1)))
1130       .scalarize(0)
1131       .scalarSameSizeAs(1, 0);
1132 
1133   getActionDefinitionsBuilder(G_PTRMASK)
1134     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1135     .scalarSameSizeAs(1, 0)
1136     .scalarize(0);
1137 
1138   auto &CmpBuilder =
1139     getActionDefinitionsBuilder(G_ICMP)
1140     // The compare output type differs based on the register bank of the output,
1141     // so make both s1 and s32 legal.
1142     //
1143     // Scalar compares producing output in scc will be promoted to s32, as that
1144     // is the allocatable register type that will be needed for the copy from
1145     // scc. This will be promoted during RegBankSelect, and we assume something
1146     // before that won't try to use s32 result types.
1147     //
1148     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1149     // bank.
1150     .legalForCartesianProduct(
1151       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1152     .legalForCartesianProduct(
1153       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1154   if (ST.has16BitInsts()) {
1155     CmpBuilder.legalFor({{S1, S16}});
1156   }
1157 
1158   CmpBuilder
1159     .widenScalarToNextPow2(1)
1160     .clampScalar(1, S32, S64)
1161     .scalarize(0)
1162     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1163 
1164   auto &FCmpBuilder =
1165       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1166           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1167 
1168   if (ST.hasSALUFloatInsts())
1169     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1170 
1171   FCmpBuilder
1172     .widenScalarToNextPow2(1)
1173     .clampScalar(1, S32, S64)
1174     .scalarize(0);
1175 
1176   // FIXME: fpow has a selection pattern that should move to custom lowering.
1177   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1178   if (ST.has16BitInsts())
1179     ExpOps.customFor({{S32}, {S16}});
1180   else
1181     ExpOps.customFor({S32});
1182   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1183         .scalarize(0);
1184 
1185   getActionDefinitionsBuilder(G_FPOWI)
1186     .clampScalar(0, MinScalarFPTy, S32)
1187     .lower();
1188 
1189   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1190   Log2Ops.customFor({S32});
1191   if (ST.has16BitInsts())
1192     Log2Ops.legalFor({S16});
1193   else
1194     Log2Ops.customFor({S16});
1195   Log2Ops.scalarize(0)
1196     .lower();
1197 
1198   auto &LogOps =
1199       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1200   LogOps.customFor({S32, S16});
1201   LogOps.clampScalar(0, MinScalarFPTy, S32)
1202         .scalarize(0);
1203 
1204   // The 64-bit versions produce 32-bit results, but only on the SALU.
1205   getActionDefinitionsBuilder(G_CTPOP)
1206     .legalFor({{S32, S32}, {S32, S64}})
1207     .clampScalar(0, S32, S32)
1208     .widenScalarToNextPow2(1, 32)
1209     .clampScalar(1, S32, S64)
1210     .scalarize(0)
1211     .widenScalarToNextPow2(0, 32);
1212 
1213   // If no 16 bit instr is available, lower into different instructions.
1214   if (ST.has16BitInsts())
1215     getActionDefinitionsBuilder(G_IS_FPCLASS)
1216         .legalForCartesianProduct({S1}, FPTypes16)
1217         .widenScalarToNextPow2(1)
1218         .scalarize(0)
1219         .lower();
1220   else
1221     getActionDefinitionsBuilder(G_IS_FPCLASS)
1222         .legalForCartesianProduct({S1}, FPTypesBase)
1223         .lowerFor({S1, S16})
1224         .widenScalarToNextPow2(1)
1225         .scalarize(0)
1226         .lower();
1227 
1228   // The hardware instructions return a different result on 0 than the generic
1229   // instructions expect. The hardware produces -1, but these produce the
1230   // bitwidth.
1231   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1232     .scalarize(0)
1233     .clampScalar(0, S32, S32)
1234     .clampScalar(1, S32, S64)
1235     .widenScalarToNextPow2(0, 32)
1236     .widenScalarToNextPow2(1, 32)
1237     .custom();
1238 
1239   // The 64-bit versions produce 32-bit results, but only on the SALU.
1240   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1241     .legalFor({{S32, S32}, {S32, S64}})
1242     .clampScalar(0, S32, S32)
1243     .clampScalar(1, S32, S64)
1244     .scalarize(0)
1245     .widenScalarToNextPow2(0, 32)
1246     .widenScalarToNextPow2(1, 32);
1247 
1248   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1249   // RegBankSelect.
1250   getActionDefinitionsBuilder(G_BITREVERSE)
1251     .legalFor({S32, S64})
1252     .clampScalar(0, S32, S64)
1253     .scalarize(0)
1254     .widenScalarToNextPow2(0);
1255 
1256   if (ST.has16BitInsts()) {
1257     getActionDefinitionsBuilder(G_BSWAP)
1258       .legalFor({S16, S32, V2S16})
1259       .clampMaxNumElementsStrict(0, S16, 2)
1260       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1261       // narrowScalar limitation.
1262       .widenScalarToNextPow2(0)
1263       .clampScalar(0, S16, S32)
1264       .scalarize(0);
1265 
1266     if (ST.hasVOP3PInsts()) {
1267       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1268         .legalFor({S32, S16, V2S16})
1269         .clampMaxNumElements(0, S16, 2)
1270         .minScalar(0, S16)
1271         .widenScalarToNextPow2(0)
1272         .scalarize(0)
1273         .lower();
1274     } else {
1275       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1276         .legalFor({S32, S16})
1277         .widenScalarToNextPow2(0)
1278         .minScalar(0, S16)
1279         .scalarize(0)
1280         .lower();
1281     }
1282   } else {
1283     // TODO: Should have same legality without v_perm_b32
1284     getActionDefinitionsBuilder(G_BSWAP)
1285       .legalFor({S32})
1286       .lowerIf(scalarNarrowerThan(0, 32))
1287       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1288       // narrowScalar limitation.
1289       .widenScalarToNextPow2(0)
1290       .maxScalar(0, S32)
1291       .scalarize(0)
1292       .lower();
1293 
1294     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1295       .legalFor({S32})
1296       .minScalar(0, S32)
1297       .widenScalarToNextPow2(0)
1298       .scalarize(0)
1299       .lower();
1300   }
1301 
1302   getActionDefinitionsBuilder(G_INTTOPTR)
1303       // List the common cases
1304       .legalForCartesianProduct(AddrSpaces64, {S64})
1305       .legalForCartesianProduct(AddrSpaces32, {S32})
1306       .scalarize(0)
1307       // Accept any address space as long as the size matches
1308       .legalIf(sameSize(0, 1))
1309       .widenScalarIf(smallerThan(1, 0),
1310                      [](const LegalityQuery &Query) {
1311                        return std::pair(
1312                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1313                      })
1314       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1315         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1316       });
1317 
1318   getActionDefinitionsBuilder(G_PTRTOINT)
1319       // List the common cases
1320       .legalForCartesianProduct(AddrSpaces64, {S64})
1321       .legalForCartesianProduct(AddrSpaces32, {S32})
1322       .scalarize(0)
1323       // Accept any address space as long as the size matches
1324       .legalIf(sameSize(0, 1))
1325       .widenScalarIf(smallerThan(0, 1),
1326                      [](const LegalityQuery &Query) {
1327                        return std::pair(
1328                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1329                      })
1330       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1331         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1332       });
1333 
1334   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1335     .scalarize(0)
1336     .custom();
1337 
1338   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1339                                     bool IsLoad) -> bool {
1340     const LLT DstTy = Query.Types[0];
1341 
1342     // Split vector extloads.
1343     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1344 
1345     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1346       return true;
1347 
1348     const LLT PtrTy = Query.Types[1];
1349     unsigned AS = PtrTy.getAddressSpace();
1350     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1351                                       Query.MMODescrs[0].Ordering !=
1352                                           AtomicOrdering::NotAtomic))
1353       return true;
1354 
1355     // Catch weird sized loads that don't evenly divide into the access sizes
1356     // TODO: May be able to widen depending on alignment etc.
1357     unsigned NumRegs = (MemSize + 31) / 32;
1358     if (NumRegs == 3) {
1359       if (!ST.hasDwordx3LoadStores())
1360         return true;
1361     } else {
1362       // If the alignment allows, these should have been widened.
1363       if (!isPowerOf2_32(NumRegs))
1364         return true;
1365     }
1366 
1367     return false;
1368   };
1369 
1370   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1371   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1372   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1373 
1374   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1375   // LDS
1376   // TODO: Unsupported flat for SI.
1377 
1378   for (unsigned Op : {G_LOAD, G_STORE}) {
1379     const bool IsStore = Op == G_STORE;
1380 
1381     auto &Actions = getActionDefinitionsBuilder(Op);
1382     // Explicitly list some common cases.
1383     // TODO: Does this help compile time at all?
1384     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1385                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1386                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1387                                       {S64, GlobalPtr, S64, GlobalAlign32},
1388                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1389                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1390                                       {S32, GlobalPtr, S8, GlobalAlign8},
1391                                       {S32, GlobalPtr, S16, GlobalAlign16},
1392 
1393                                       {S32, LocalPtr, S32, 32},
1394                                       {S64, LocalPtr, S64, 32},
1395                                       {V2S32, LocalPtr, V2S32, 32},
1396                                       {S32, LocalPtr, S8, 8},
1397                                       {S32, LocalPtr, S16, 16},
1398                                       {V2S16, LocalPtr, S32, 32},
1399 
1400                                       {S32, PrivatePtr, S32, 32},
1401                                       {S32, PrivatePtr, S8, 8},
1402                                       {S32, PrivatePtr, S16, 16},
1403                                       {V2S16, PrivatePtr, S32, 32},
1404 
1405                                       {S32, ConstantPtr, S32, GlobalAlign32},
1406                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1407                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1408                                       {S64, ConstantPtr, S64, GlobalAlign32},
1409                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1410     Actions.legalIf(
1411       [=](const LegalityQuery &Query) -> bool {
1412         return isLoadStoreLegal(ST, Query);
1413       });
1414 
1415     // The custom pointers (fat pointers, buffer resources) don't work with load
1416     // and store at this level. Fat pointers should have been lowered to
1417     // intrinsics before the translation to MIR.
1418     Actions.unsupportedIf(
1419         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1420 
1421     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1422     // ptrtoint. This is needed to account for the fact that we can't have i128
1423     // as a register class for SelectionDAG reasons.
1424     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1425       return hasBufferRsrcWorkaround(Query.Types[0]);
1426     });
1427 
1428     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1429     // 64-bits.
1430     //
1431     // TODO: Should generalize bitcast action into coerce, which will also cover
1432     // inserting addrspacecasts.
1433     Actions.customIf(typeIs(1, Constant32Ptr));
1434 
1435     // Turn any illegal element vectors into something easier to deal
1436     // with. These will ultimately produce 32-bit scalar shifts to extract the
1437     // parts anyway.
1438     //
1439     // For odd 16-bit element vectors, prefer to split those into pieces with
1440     // 16-bit vector parts.
1441     Actions.bitcastIf(
1442       [=](const LegalityQuery &Query) -> bool {
1443         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1444                                           Query.MMODescrs[0].MemoryTy);
1445       }, bitcastToRegisterType(0));
1446 
1447     if (!IsStore) {
1448       // Widen suitably aligned loads by loading extra bytes. The standard
1449       // legalization actions can't properly express widening memory operands.
1450       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1451         return shouldWidenLoad(ST, Query, G_LOAD);
1452       });
1453     }
1454 
1455     // FIXME: load/store narrowing should be moved to lower action
1456     Actions
1457         .narrowScalarIf(
1458             [=](const LegalityQuery &Query) -> bool {
1459               return !Query.Types[0].isVector() &&
1460                      needToSplitMemOp(Query, Op == G_LOAD);
1461             },
1462             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1463               const LLT DstTy = Query.Types[0];
1464               const LLT PtrTy = Query.Types[1];
1465 
1466               const unsigned DstSize = DstTy.getSizeInBits();
1467               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1468 
1469               // Split extloads.
1470               if (DstSize > MemSize)
1471                 return std::pair(0, LLT::scalar(MemSize));
1472 
1473               unsigned MaxSize = maxSizeForAddrSpace(
1474                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1475                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1476               if (MemSize > MaxSize)
1477                 return std::pair(0, LLT::scalar(MaxSize));
1478 
1479               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1480               return std::pair(0, LLT::scalar(Align));
1481             })
1482         .fewerElementsIf(
1483             [=](const LegalityQuery &Query) -> bool {
1484               return Query.Types[0].isVector() &&
1485                      needToSplitMemOp(Query, Op == G_LOAD);
1486             },
1487             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1488               const LLT DstTy = Query.Types[0];
1489               const LLT PtrTy = Query.Types[1];
1490 
1491               LLT EltTy = DstTy.getElementType();
1492               unsigned MaxSize = maxSizeForAddrSpace(
1493                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1494                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1495 
1496               // FIXME: Handle widened to power of 2 results better. This ends
1497               // up scalarizing.
1498               // FIXME: 3 element stores scalarized on SI
1499 
1500               // Split if it's too large for the address space.
1501               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1502               if (MemSize > MaxSize) {
1503                 unsigned NumElts = DstTy.getNumElements();
1504                 unsigned EltSize = EltTy.getSizeInBits();
1505 
1506                 if (MaxSize % EltSize == 0) {
1507                   return std::pair(
1508                       0, LLT::scalarOrVector(
1509                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1510                 }
1511 
1512                 unsigned NumPieces = MemSize / MaxSize;
1513 
1514                 // FIXME: Refine when odd breakdowns handled
1515                 // The scalars will need to be re-legalized.
1516                 if (NumPieces == 1 || NumPieces >= NumElts ||
1517                     NumElts % NumPieces != 0)
1518                   return std::pair(0, EltTy);
1519 
1520                 return std::pair(0,
1521                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1522               }
1523 
1524               // FIXME: We could probably handle weird extending loads better.
1525               if (DstTy.getSizeInBits() > MemSize)
1526                 return std::pair(0, EltTy);
1527 
1528               unsigned EltSize = EltTy.getSizeInBits();
1529               unsigned DstSize = DstTy.getSizeInBits();
1530               if (!isPowerOf2_32(DstSize)) {
1531                 // We're probably decomposing an odd sized store. Try to split
1532                 // to the widest type. TODO: Account for alignment. As-is it
1533                 // should be OK, since the new parts will be further legalized.
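                     // For example, an odd-sized <3 x s32> piece would be
                     // narrowed to <2 x s32> here (bit_floor(96) == 64), with
                     // the remaining element handled when the resulting parts
                     // are re-legalized.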
1534                 unsigned FloorSize = llvm::bit_floor(DstSize);
1535                 return std::pair(
1536                     0, LLT::scalarOrVector(
1537                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1538               }
1539 
1540               // May need relegalization for the scalars.
1541               return std::pair(0, EltTy);
1542             })
1543     .minScalar(0, S32)
1544     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1545     .widenScalarToNextPow2(0)
1546     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1547     .lower();
1548   }
1549 
1550   // FIXME: Unaligned accesses not lowered.
1551   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1552                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1553                                                   {S32, GlobalPtr, S16, 2 * 8},
1554                                                   {S32, LocalPtr, S8, 8},
1555                                                   {S32, LocalPtr, S16, 16},
1556                                                   {S32, PrivatePtr, S8, 8},
1557                                                   {S32, PrivatePtr, S16, 16},
1558                                                   {S32, ConstantPtr, S8, 8},
1559                                                   {S32, ConstantPtr, S16, 2 * 8}})
1560                        .legalIf(
1561                          [=](const LegalityQuery &Query) -> bool {
1562                            return isLoadStoreLegal(ST, Query);
1563                          });
1564 
1565   if (ST.hasFlatAddressSpace()) {
1566     ExtLoads.legalForTypesWithMemDesc(
1567         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1568   }
1569 
1570   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1571   // 64-bits.
1572   //
1573   // TODO: Should generalize bitcast action into coerce, which will also cover
1574   // inserting addrspacecasts.
1575   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1576 
1577   ExtLoads.clampScalar(0, S32, S32)
1578           .widenScalarToNextPow2(0)
1579           .lower();
1580 
1581   auto &Atomics = getActionDefinitionsBuilder(
1582     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1583      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1584      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1585      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1586     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1587                {S64, GlobalPtr}, {S64, LocalPtr},
1588                {S32, RegionPtr}, {S64, RegionPtr}});
1589   if (ST.hasFlatAddressSpace()) {
1590     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1591   }
1592 
1593   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1594   if (ST.hasLDSFPAtomicAdd()) {
1595     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1596     if (ST.hasGFX90AInsts())
1597       Atomic.legalFor({{S64, LocalPtr}});
1598     if (ST.hasAtomicDsPkAdd16Insts())
1599       Atomic.legalFor({{V2S16, LocalPtr}});
1600   }
1601   if (ST.hasAtomicFaddInsts())
1602     Atomic.legalFor({{S32, GlobalPtr}});
1603   if (ST.hasFlatAtomicFaddF32Inst())
1604     Atomic.legalFor({{S32, FlatPtr}});
1605 
1606   if (ST.hasGFX90AInsts()) {
1607     // These are legal with some caveats, and should have undergone expansion in
1608     // the IR in most situations
1609     // TODO: Move atomic expansion into legalizer
1610     Atomic.legalFor({
1611         {S32, GlobalPtr},
1612         {S64, GlobalPtr},
1613         {S64, FlatPtr}
1614       });
1615   }
1616 
1617   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1618   // demarshalling
1619   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1620     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1621                 {S32, FlatPtr}, {S64, FlatPtr}})
1622     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1623                {S32, RegionPtr}, {S64, RegionPtr}});
1624   // TODO: Pointer types, any 32-bit or 64-bit vector
1625 
1626   // Condition should be s32 for scalar, s1 for vector.
1627   getActionDefinitionsBuilder(G_SELECT)
1628       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1629                                  LocalPtr, FlatPtr, PrivatePtr,
1630                                  LLT::fixed_vector(2, LocalPtr),
1631                                  LLT::fixed_vector(2, PrivatePtr)},
1632                                 {S1, S32})
1633       .clampScalar(0, S16, S64)
1634       .scalarize(1)
1635       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1636       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1637       .clampMaxNumElements(0, S32, 2)
1638       .clampMaxNumElements(0, LocalPtr, 2)
1639       .clampMaxNumElements(0, PrivatePtr, 2)
1640       .scalarize(0)
1641       .widenScalarToNextPow2(0)
1642       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1643 
1644   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1645   // be more flexible with the shift amount type.
1646   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1647     .legalFor({{S32, S32}, {S64, S32}});
1648   if (ST.has16BitInsts()) {
1649     if (ST.hasVOP3PInsts()) {
1650       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1651             .clampMaxNumElements(0, S16, 2);
1652     } else
1653       Shifts.legalFor({{S16, S16}});
1654 
1655     // TODO: Support 16-bit shift amounts for all types
1656     Shifts.widenScalarIf(
1657       [=](const LegalityQuery &Query) {
1658         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1659         // 32-bit amount.
1660         const LLT ValTy = Query.Types[0];
1661         const LLT AmountTy = Query.Types[1];
1662         return ValTy.getSizeInBits() <= 16 &&
1663                AmountTy.getSizeInBits() < 16;
1664       }, changeTo(1, S16));
1665     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1666     Shifts.clampScalar(1, S32, S32);
1667     Shifts.widenScalarToNextPow2(0, 16);
1668     Shifts.clampScalar(0, S16, S64);
1669 
1670     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1671       .minScalar(0, S16)
1672       .scalarize(0)
1673       .lower();
1674   } else {
1675     // Make sure we legalize the shift amount type first, as the general
1676     // expansion for the shifted type will produce much worse code if it hasn't
1677     // been truncated already.
1678     Shifts.clampScalar(1, S32, S32);
1679     Shifts.widenScalarToNextPow2(0, 32);
1680     Shifts.clampScalar(0, S32, S64);
1681 
1682     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1683       .minScalar(0, S32)
1684       .scalarize(0)
1685       .lower();
1686   }
1687   Shifts.scalarize(0);
1688 
1689   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1690     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1691     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1692     unsigned IdxTypeIdx = 2;
1693 
1694     getActionDefinitionsBuilder(Op)
1695       .customIf([=](const LegalityQuery &Query) {
1696           const LLT EltTy = Query.Types[EltTypeIdx];
1697           const LLT VecTy = Query.Types[VecTypeIdx];
1698           const LLT IdxTy = Query.Types[IdxTypeIdx];
1699           const unsigned EltSize = EltTy.getSizeInBits();
1700           const bool isLegalVecType =
1701               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1702           // Address space 8 pointers are 128-bit wide values, but the logic
1703           // below will try to bitcast them to 2N x s64, which will fail.
1704           // Therefore, as an intermediate step, handle extracts/insertions by
1705           // ptrtoint-ing the vector and scalar arguments (and inttoptr-ing the
1706           // extraction result) in order to produce a vector operation that can
1707           // be handled by the logic below.
1708           if (EltTy.isPointer() && EltSize > 64)
1709             return true;
1710           return (EltSize == 32 || EltSize == 64) &&
1711                   VecTy.getSizeInBits() % 32 == 0 &&
1712                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1713                   IdxTy.getSizeInBits() == 32 &&
1714                   isLegalVecType;
1715         })
1716       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1717                  bitcastToVectorElement32(VecTypeIdx))
1718       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1719       .bitcastIf(
1720         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1721         [=](const LegalityQuery &Query) {
1722           // For > 64-bit element types, try to turn this into a 64-bit
1723           // element vector since we may be able to do better indexing
1724           // if this is scalar. If not, fall back to 32.
1725           const LLT EltTy = Query.Types[EltTypeIdx];
1726           const LLT VecTy = Query.Types[VecTypeIdx];
1727           const unsigned DstEltSize = EltTy.getSizeInBits();
1728           const unsigned VecSize = VecTy.getSizeInBits();
1729 
1730           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1731           return std::pair(
1732               VecTypeIdx,
1733               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1734         })
1735       .clampScalar(EltTypeIdx, S32, S64)
1736       .clampScalar(VecTypeIdx, S32, S64)
1737       .clampScalar(IdxTypeIdx, S32, S32)
1738       .clampMaxNumElements(VecTypeIdx, S32, 32)
1739       // TODO: Clamp elements for 64-bit vectors?
1740       .moreElementsIf(
1741         isIllegalRegisterType(VecTypeIdx),
1742         moreElementsToNextExistingRegClass(VecTypeIdx))
1743       // It should only be necessary with variable indexes.
1744       // As a last resort, lower to the stack
1745       .lower();
1746   }
1747 
1748   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1749     .unsupportedIf([=](const LegalityQuery &Query) {
1750         const LLT &EltTy = Query.Types[1].getElementType();
1751         return Query.Types[0] != EltTy;
1752       });
1753 
1754   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1755     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1756     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1757 
1758     // FIXME: Doesn't handle extract of illegal sizes.
1759     getActionDefinitionsBuilder(Op)
1760       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1761       .lowerIf([=](const LegalityQuery &Query) {
1762           // Sub-vector (or single-element) insert and extract.
1763           // TODO: verify immediate offset here since lower only works with
1764           // whole elements.
1765           const LLT BigTy = Query.Types[BigTyIdx];
1766           return BigTy.isVector();
1767         })
1768       // FIXME: Multiples of 16 should not be legal.
1769       .legalIf([=](const LegalityQuery &Query) {
1770           const LLT BigTy = Query.Types[BigTyIdx];
1771           const LLT LitTy = Query.Types[LitTyIdx];
1772           return (BigTy.getSizeInBits() % 32 == 0) &&
1773                  (LitTy.getSizeInBits() % 16 == 0);
1774         })
1775       .widenScalarIf(
1776         [=](const LegalityQuery &Query) {
1777           const LLT BigTy = Query.Types[BigTyIdx];
1778           return (BigTy.getScalarSizeInBits() < 16);
1779         },
1780         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1781       .widenScalarIf(
1782         [=](const LegalityQuery &Query) {
1783           const LLT LitTy = Query.Types[LitTyIdx];
1784           return (LitTy.getScalarSizeInBits() < 16);
1785         },
1786         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1787       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1788       .widenScalarToNextPow2(BigTyIdx, 32);
1789 
1790   }
1791 
1792   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1793     .legalForCartesianProduct(AllS32Vectors, {S32})
1794     .legalForCartesianProduct(AllS64Vectors, {S64})
1795     .clampNumElements(0, V16S32, V32S32)
1796     .clampNumElements(0, V2S64, V16S64)
1797     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1798     .moreElementsIf(
1799       isIllegalRegisterType(0),
1800       moreElementsToNextExistingRegClass(0));
1801 
1802   if (ST.hasScalarPackInsts()) {
1803     BuildVector
1804       // FIXME: Should probably widen s1 vectors straight to s32
1805       .minScalarOrElt(0, S16)
1806       .minScalar(1, S16);
1807 
1808     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1809       .legalFor({V2S16, S32})
1810       .lower();
1811   } else {
1812     BuildVector.customFor({V2S16, S16});
1813     BuildVector.minScalarOrElt(0, S32);
1814 
1815     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1816       .customFor({V2S16, S32})
1817       .lower();
1818   }
1819 
1820   BuildVector.legalIf(isRegisterType(0));
1821 
1822   // FIXME: Clamp maximum size
1823   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1824     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1825     .clampMaxNumElements(0, S32, 32)
1826     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1827     .clampMaxNumElements(0, S16, 64);
1828 
1829   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1830 
1831   // Merge/Unmerge
1832   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1833     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1834     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1835 
1836     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1837       const LLT Ty = Query.Types[TypeIdx];
1838       if (Ty.isVector()) {
1839         const LLT &EltTy = Ty.getElementType();
1840         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1841           return true;
1842         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1843           return true;
1844       }
1845       return false;
1846     };
1847 
1848     auto &Builder = getActionDefinitionsBuilder(Op)
1849       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1850       .lowerFor({{S16, V2S16}})
1851       .lowerIf([=](const LegalityQuery &Query) {
1852           const LLT BigTy = Query.Types[BigTyIdx];
1853           return BigTy.getSizeInBits() == 32;
1854         })
1855       // Try to widen to s16 first for small types.
1856       // TODO: Only do this on targets with legal s16 shifts
1857       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1858       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1859       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1860       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1861                            elementTypeIs(1, S16)),
1862                        changeTo(1, V2S16))
1863       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1864       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1865       // valid.
1866       .clampScalar(LitTyIdx, S32, S512)
1867       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1868       // Break up vectors with weird elements into scalars
1869       .fewerElementsIf(
1870         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1871         scalarize(0))
1872       .fewerElementsIf(
1873         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1874         scalarize(1))
1875       .clampScalar(BigTyIdx, S32, MaxScalar);
1876 
1877     if (Op == G_MERGE_VALUES) {
1878       Builder.widenScalarIf(
1879         // TODO: Use 16-bit shifts if legal for 8-bit values?
1880         [=](const LegalityQuery &Query) {
1881           const LLT Ty = Query.Types[LitTyIdx];
1882           return Ty.getSizeInBits() < 32;
1883         },
1884         changeTo(LitTyIdx, S32));
1885     }
1886 
1887     Builder.widenScalarIf(
1888       [=](const LegalityQuery &Query) {
1889         const LLT Ty = Query.Types[BigTyIdx];
1890         return Ty.getSizeInBits() % 16 != 0;
1891       },
1892       [=](const LegalityQuery &Query) {
1893         // Pick the next power of 2, or, for sizes over 128 bits, the next
1894         // multiple of 64, whichever is smaller.
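             // For example, 90 bits rounds up to 128 (the next power of 2),
             // while 300 bits rounds up to 320, since alignTo<64>(301) == 320
             // is smaller than the next power of 2 (512).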
1895         const LLT &Ty = Query.Types[BigTyIdx];
1896         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1897         if (NewSizeInBits >= 256) {
1898           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1899           if (RoundedTo < NewSizeInBits)
1900             NewSizeInBits = RoundedTo;
1901         }
1902         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1903       })
1904       // Any vectors left are the wrong size. Scalarize them.
1905       .scalarize(0)
1906       .scalarize(1);
1907   }
1908 
1909   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1910   // RegBankSelect.
1911   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1912     .legalFor({{S32}, {S64}});
1913 
1914   if (ST.hasVOP3PInsts()) {
1915     SextInReg.lowerFor({{V2S16}})
1916       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1917       // get more vector shift opportunities, since we'll get those when
1918       // expanded.
1919       .clampMaxNumElementsStrict(0, S16, 2);
1920   } else if (ST.has16BitInsts()) {
1921     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1922   } else {
1923     // Prefer to promote to s32 before lowering if we don't have 16-bit
1924     // shifts. This avoids a lot of intermediate truncate and extend operations.
1925     SextInReg.lowerFor({{S32}, {S64}});
1926   }
1927 
1928   SextInReg
1929     .scalarize(0)
1930     .clampScalar(0, S32, S64)
1931     .lower();
1932 
1933   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1934     .scalarize(0)
1935     .lower();
1936 
1937   // TODO: Only try to form v2s16 with legal packed instructions.
1938   getActionDefinitionsBuilder(G_FSHR)
1939     .legalFor({{S32, S32}})
1940     .lowerFor({{V2S16, V2S16}})
1941     .clampMaxNumElementsStrict(0, S16, 2)
1942     .scalarize(0)
1943     .lower();
1944 
1945   if (ST.hasVOP3PInsts()) {
1946     getActionDefinitionsBuilder(G_FSHL)
1947       .lowerFor({{V2S16, V2S16}})
1948       .clampMaxNumElementsStrict(0, S16, 2)
1949       .scalarize(0)
1950       .lower();
1951   } else {
1952     getActionDefinitionsBuilder(G_FSHL)
1953       .scalarize(0)
1954       .lower();
1955   }
1956 
1957   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1958     .legalFor({S64});
1959 
1960   getActionDefinitionsBuilder(G_FENCE)
1961     .alwaysLegal();
1962 
1963   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1964       .scalarize(0)
1965       .minScalar(0, S32)
1966       .lower();
1967 
1968   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1969       .legalFor({{S32, S32}, {S64, S32}})
1970       .clampScalar(1, S32, S32)
1971       .clampScalar(0, S32, S64)
1972       .widenScalarToNextPow2(0)
1973       .scalarize(0);
1974 
1975   getActionDefinitionsBuilder(
1976       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
1977        G_FCOPYSIGN,
1978 
1979        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
1980        G_READ_REGISTER, G_WRITE_REGISTER,
1981 
1982        G_SADDO, G_SSUBO})
1983       .lower();
1984 
1985   if (ST.hasIEEEMinMax()) {
1986     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
1987         .legalFor(FPTypesPK16)
1988         .clampMaxNumElements(0, S16, 2)
1989         .scalarize(0);
1990   } else {
1991     // TODO: Implement
1992     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
1993   }
1994 
1995   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1996       .lower();
1997 
1998   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1999         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2000         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2001     .unsupported();
2002 
2003   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2004 
2005   getLegacyLegalizerInfo().computeTables();
2006   verify(*ST.getInstrInfo());
2007 }
2008 
2009 bool AMDGPULegalizerInfo::legalizeCustom(
2010     LegalizerHelper &Helper, MachineInstr &MI,
2011     LostDebugLocObserver &LocObserver) const {
2012   MachineIRBuilder &B = Helper.MIRBuilder;
2013   MachineRegisterInfo &MRI = *B.getMRI();
2014 
2015   switch (MI.getOpcode()) {
2016   case TargetOpcode::G_ADDRSPACE_CAST:
2017     return legalizeAddrSpaceCast(MI, MRI, B);
2018   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2019     return legalizeFroundeven(MI, MRI, B);
2020   case TargetOpcode::G_FCEIL:
2021     return legalizeFceil(MI, MRI, B);
2022   case TargetOpcode::G_FREM:
2023     return legalizeFrem(MI, MRI, B);
2024   case TargetOpcode::G_INTRINSIC_TRUNC:
2025     return legalizeIntrinsicTrunc(MI, MRI, B);
2026   case TargetOpcode::G_SITOFP:
2027     return legalizeITOFP(MI, MRI, B, true);
2028   case TargetOpcode::G_UITOFP:
2029     return legalizeITOFP(MI, MRI, B, false);
2030   case TargetOpcode::G_FPTOSI:
2031     return legalizeFPTOI(MI, MRI, B, true);
2032   case TargetOpcode::G_FPTOUI:
2033     return legalizeFPTOI(MI, MRI, B, false);
2034   case TargetOpcode::G_FMINNUM:
2035   case TargetOpcode::G_FMAXNUM:
2036   case TargetOpcode::G_FMINNUM_IEEE:
2037   case TargetOpcode::G_FMAXNUM_IEEE:
2038     return legalizeMinNumMaxNum(Helper, MI);
2039   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2040     return legalizeExtractVectorElt(MI, MRI, B);
2041   case TargetOpcode::G_INSERT_VECTOR_ELT:
2042     return legalizeInsertVectorElt(MI, MRI, B);
2043   case TargetOpcode::G_FSIN:
2044   case TargetOpcode::G_FCOS:
2045     return legalizeSinCos(MI, MRI, B);
2046   case TargetOpcode::G_GLOBAL_VALUE:
2047     return legalizeGlobalValue(MI, MRI, B);
2048   case TargetOpcode::G_LOAD:
2049   case TargetOpcode::G_SEXTLOAD:
2050   case TargetOpcode::G_ZEXTLOAD:
2051     return legalizeLoad(Helper, MI);
2052   case TargetOpcode::G_STORE:
2053     return legalizeStore(Helper, MI);
2054   case TargetOpcode::G_FMAD:
2055     return legalizeFMad(MI, MRI, B);
2056   case TargetOpcode::G_FDIV:
2057     return legalizeFDIV(MI, MRI, B);
2058   case TargetOpcode::G_FFREXP:
2059     return legalizeFFREXP(MI, MRI, B);
2060   case TargetOpcode::G_FSQRT:
2061     return legalizeFSQRT(MI, MRI, B);
2062   case TargetOpcode::G_UDIV:
2063   case TargetOpcode::G_UREM:
2064   case TargetOpcode::G_UDIVREM:
2065     return legalizeUnsignedDIV_REM(MI, MRI, B);
2066   case TargetOpcode::G_SDIV:
2067   case TargetOpcode::G_SREM:
2068   case TargetOpcode::G_SDIVREM:
2069     return legalizeSignedDIV_REM(MI, MRI, B);
2070   case TargetOpcode::G_ATOMIC_CMPXCHG:
2071     return legalizeAtomicCmpXChg(MI, MRI, B);
2072   case TargetOpcode::G_FLOG2:
2073     return legalizeFlog2(MI, B);
2074   case TargetOpcode::G_FLOG:
2075   case TargetOpcode::G_FLOG10:
2076     return legalizeFlogCommon(MI, B);
2077   case TargetOpcode::G_FEXP2:
2078     return legalizeFExp2(MI, B);
2079   case TargetOpcode::G_FEXP:
2080   case TargetOpcode::G_FEXP10:
2081     return legalizeFExp(MI, B);
2082   case TargetOpcode::G_FPOW:
2083     return legalizeFPow(MI, B);
2084   case TargetOpcode::G_FFLOOR:
2085     return legalizeFFloor(MI, MRI, B);
2086   case TargetOpcode::G_BUILD_VECTOR:
2087   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2088     return legalizeBuildVector(MI, MRI, B);
2089   case TargetOpcode::G_MUL:
2090     return legalizeMul(Helper, MI);
2091   case TargetOpcode::G_CTLZ:
2092   case TargetOpcode::G_CTTZ:
2093     return legalizeCTLZ_CTTZ(MI, MRI, B);
2094   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2095     return legalizeFPTruncRound(MI, B);
2096   case TargetOpcode::G_STACKSAVE:
2097     return legalizeStackSave(MI, B);
2098   default:
2099     return false;
2100   }
2101 
2102   llvm_unreachable("expected switch to return");
2103 }
2104 
2105 Register AMDGPULegalizerInfo::getSegmentAperture(
2106   unsigned AS,
2107   MachineRegisterInfo &MRI,
2108   MachineIRBuilder &B) const {
2109   MachineFunction &MF = B.getMF();
2110   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2111   const LLT S32 = LLT::scalar(32);
2112   const LLT S64 = LLT::scalar(64);
2113 
2114   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2115 
2116   if (ST.hasApertureRegs()) {
2117     // Note: this register is somewhat broken. When used as a 32-bit operand,
2118     // it only returns zeroes. The real value is in the upper 32 bits.
2119     // Thus, we must extract the high 32 bits.
2120     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2121                                        ? AMDGPU::SRC_SHARED_BASE
2122                                        : AMDGPU::SRC_PRIVATE_BASE;
2123     // FIXME: It would be more natural to emit a COPY here, but then copy
2124     // coalescing would kick in and it would think it's okay to use the "HI"
2125     // subregister (instead of extracting the HI 32 bits) which is an artificial
2126     // (unusable) register.
2127     //  Register TableGen definitions would need an overhaul to get rid of the
2128     //  artificial "HI" aperture registers and prevent this kind of issue from
2129     //  happening.
2130     Register Dst = MRI.createGenericVirtualRegister(S64);
2131     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2132     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2133     return B.buildUnmerge(S32, Dst).getReg(1);
2134   }
2135 
2136   // TODO: can we be smarter about machine pointer info?
2137   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2138   Register LoadAddr = MRI.createGenericVirtualRegister(
2139     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2140   // For code object version 5, private_base and shared_base are passed through
2141   // implicit kernargs.
2142   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
2143       AMDGPU::AMDHSA_COV5) {
2144     AMDGPUTargetLowering::ImplicitParameter Param =
2145         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2146                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2147     uint64_t Offset =
2148         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2149 
2150     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2151         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2152 
2153     if (!loadInputValue(KernargPtrReg, B,
2154                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2155       return Register();
2156 
2157     MachineMemOperand *MMO = MF.getMachineMemOperand(
2158         PtrInfo,
2159         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2160             MachineMemOperand::MOInvariant,
2161         LLT::scalar(32), commonAlignment(Align(64), Offset));
2162 
2163     // Pointer address
2164     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2165                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2166     // Load address
2167     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2168   }
2169 
2170   Register QueuePtr = MRI.createGenericVirtualRegister(
2171     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2172 
2173   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2174     return Register();
2175 
2176   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2177   // private_segment_aperture_base_hi.
2178   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2179 
2180   MachineMemOperand *MMO = MF.getMachineMemOperand(
2181       PtrInfo,
2182       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2183           MachineMemOperand::MOInvariant,
2184       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2185 
2186   B.buildPtrAdd(LoadAddr, QueuePtr,
2187                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2188   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2189 }
2190 
2191 /// Return true if the value is a known valid address, such that a null check is
2192 /// not necessary.
2193 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2194                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2195   MachineInstr *Def = MRI.getVRegDef(Val);
2196   switch (Def->getOpcode()) {
2197   case AMDGPU::G_FRAME_INDEX:
2198   case AMDGPU::G_GLOBAL_VALUE:
2199   case AMDGPU::G_BLOCK_ADDR:
2200     return true;
2201   case AMDGPU::G_CONSTANT: {
2202     const ConstantInt *CI = Def->getOperand(1).getCImm();
2203     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2204   }
2205   default:
2206     return false;
2207   }
2208 
2209   return false;
2210 }
2211 
2212 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2213   MachineInstr &MI, MachineRegisterInfo &MRI,
2214   MachineIRBuilder &B) const {
2215   MachineFunction &MF = B.getMF();
2216 
2217   const LLT S32 = LLT::scalar(32);
2218   Register Dst = MI.getOperand(0).getReg();
2219   Register Src = MI.getOperand(1).getReg();
2220 
2221   LLT DstTy = MRI.getType(Dst);
2222   LLT SrcTy = MRI.getType(Src);
2223   unsigned DestAS = DstTy.getAddressSpace();
2224   unsigned SrcAS = SrcTy.getAddressSpace();
2225 
2226   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2227   // vector element.
2228   assert(!DstTy.isVector());
2229 
2230   const AMDGPUTargetMachine &TM
2231     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2232 
2233   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2234     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2235     return true;
2236   }
2237 
2238   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2239       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2240        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2241     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2242       // Extract low 32-bits of the pointer.
2243       B.buildExtract(Dst, Src, 0);
2244       MI.eraseFromParent();
2245       return true;
2246     }
2247 
2248     unsigned NullVal = TM.getNullPointerValue(DestAS);
2249 
2250     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2251     auto FlatNull = B.buildConstant(SrcTy, 0);
2252 
2253     // Extract low 32-bits of the pointer.
2254     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2255 
2256     auto CmpRes =
2257         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2258     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2259 
2260     MI.eraseFromParent();
2261     return true;
2262   }
2263 
2264   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2265       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2266        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2267     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2268     if (!ApertureReg.isValid())
2269       return false;
2270 
2271     // Coerce the type of the low half of the result so we can use merge_values.
2272     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2273 
2274     // TODO: Should we allow mismatched types but matching sizes in merges to
2275     // avoid the ptrtoint?
2276     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2277 
2278     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2279       B.buildCopy(Dst, BuildPtr);
2280       MI.eraseFromParent();
2281       return true;
2282     }
2283 
2284     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2285     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2286 
2287     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2288                               SegmentNull.getReg(0));
2289 
2290     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2291 
2292     MI.eraseFromParent();
2293     return true;
2294   }
2295 
2296   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2297       SrcTy.getSizeInBits() == 64) {
2298     // Truncate.
2299     B.buildExtract(Dst, Src, 0);
2300     MI.eraseFromParent();
2301     return true;
2302   }
2303 
2304   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2305       DstTy.getSizeInBits() == 64) {
2306     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2307     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2308     auto PtrLo = B.buildPtrToInt(S32, Src);
2309     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2310     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2311     MI.eraseFromParent();
2312     return true;
2313   }
2314 
2315   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2316       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2317 
2318   LLVMContext &Ctx = MF.getFunction().getContext();
2319   Ctx.diagnose(InvalidAddrSpaceCast);
2320   B.buildUndef(Dst);
2321   MI.eraseFromParent();
2322   return true;
2323 }
2324 
2325 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2326                                              MachineRegisterInfo &MRI,
2327                                              MachineIRBuilder &B) const {
2328   Register Src = MI.getOperand(1).getReg();
2329   LLT Ty = MRI.getType(Src);
2330   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2331 
2332   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2333   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
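       // A rough sketch of the idea: for |Src| < 2^52, adding copysign(2^52, Src)
       // pushes the fractional bits out of the double's mantissa, so the addition
       // itself rounds Src to the nearest integer (ties to even under the default
       // rounding mode), and subtracting the same value recovers that integer.
       // Inputs with |Src| > 0x1.fffffffffffffp+51 have no fractional bits left,
       // so the final select returns them unchanged.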
2334 
2335   auto C1 = B.buildFConstant(Ty, C1Val);
2336   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2337 
2338   // TODO: Should this propagate fast-math-flags?
2339   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2340   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2341 
2342   auto C2 = B.buildFConstant(Ty, C2Val);
2343   auto Fabs = B.buildFAbs(Ty, Src);
2344 
2345   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2346   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2347   MI.eraseFromParent();
2348   return true;
2349 }
2350 
2351 bool AMDGPULegalizerInfo::legalizeFceil(
2352   MachineInstr &MI, MachineRegisterInfo &MRI,
2353   MachineIRBuilder &B) const {
2354 
2355   const LLT S1 = LLT::scalar(1);
2356   const LLT S64 = LLT::scalar(64);
2357 
2358   Register Src = MI.getOperand(1).getReg();
2359   assert(MRI.getType(Src) == S64);
2360 
2361   // result = trunc(src)
2362   // if (src > 0.0 && src != result)
2363   //   result += 1.0
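       // For example (informal): fceil(2.3) -> trunc = 2.0, and since 2.3 > 0 and
       // 2.3 != 2.0, the result is 2.0 + 1.0 = 3.0; fceil(-2.3) -> trunc = -2.0,
       // the source is not positive, so the result stays -2.0.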
2364 
2365   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2366 
2367   const auto Zero = B.buildFConstant(S64, 0.0);
2368   const auto One = B.buildFConstant(S64, 1.0);
2369   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2370   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2371   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2372   auto Add = B.buildSelect(S64, And, One, Zero);
2373 
2374   // TODO: Should this propagate fast-math-flags?
2375   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2376   MI.eraseFromParent();
2377   return true;
2378 }
2379 
2380 bool AMDGPULegalizerInfo::legalizeFrem(
2381   MachineInstr &MI, MachineRegisterInfo &MRI,
2382   MachineIRBuilder &B) const {
2383     Register DstReg = MI.getOperand(0).getReg();
2384     Register Src0Reg = MI.getOperand(1).getReg();
2385     Register Src1Reg = MI.getOperand(2).getReg();
2386     auto Flags = MI.getFlags();
2387     LLT Ty = MRI.getType(DstReg);
2388 
2389     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2390     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2391     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2392     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2393     MI.eraseFromParent();
2394     return true;
2395 }
2396 
2397 static MachineInstrBuilder extractF64Exponent(Register Hi,
2398                                               MachineIRBuilder &B) {
2399   const unsigned FractBits = 52;
2400   const unsigned ExpBits = 11;
2401   LLT S32 = LLT::scalar(32);
2402 
2403   auto Const0 = B.buildConstant(S32, FractBits - 32);
2404   auto Const1 = B.buildConstant(S32, ExpBits);
2405 
2406   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2407                      .addUse(Hi)
2408                      .addUse(Const0.getReg(0))
2409                      .addUse(Const1.getReg(0));
2410 
2411   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2412 }
2413 
2414 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2415   MachineInstr &MI, MachineRegisterInfo &MRI,
2416   MachineIRBuilder &B) const {
2417   const LLT S1 = LLT::scalar(1);
2418   const LLT S32 = LLT::scalar(32);
2419   const LLT S64 = LLT::scalar(64);
2420 
2421   Register Src = MI.getOperand(1).getReg();
2422   assert(MRI.getType(Src) == S64);
2423 
2424   // TODO: Should this use extract since the low half is unused?
2425   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2426   Register Hi = Unmerge.getReg(1);
2427 
2428   // Extract the upper half, since this is where we will find the sign and
2429   // exponent.
2430   auto Exp = extractF64Exponent(Hi, B);
2431 
2432   const unsigned FractBits = 52;
2433 
2434   // Extract the sign bit.
2435   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2436   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2437 
2438   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2439 
2440   const auto Zero32 = B.buildConstant(S32, 0);
2441 
2442   // Extend back to 64-bits.
2443   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2444 
2445   auto Shr = B.buildAShr(S64, FractMask, Exp);
2446   auto Not = B.buildNot(S64, Shr);
2447   auto Tmp0 = B.buildAnd(S64, Src, Not);
2448   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2449 
2450   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2451   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2452 
2453   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2454   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2455   MI.eraseFromParent();
2456   return true;
2457 }
2458 
2459 bool AMDGPULegalizerInfo::legalizeITOFP(
2460   MachineInstr &MI, MachineRegisterInfo &MRI,
2461   MachineIRBuilder &B, bool Signed) const {
2462 
2463   Register Dst = MI.getOperand(0).getReg();
2464   Register Src = MI.getOperand(1).getReg();
2465 
2466   const LLT S64 = LLT::scalar(64);
2467   const LLT S32 = LLT::scalar(32);
2468 
2469   assert(MRI.getType(Src) == S64);
2470 
2471   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2472   auto ThirtyTwo = B.buildConstant(S32, 32);
2473 
2474   if (MRI.getType(Dst) == S64) {
2475     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2476                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2477 
2478     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2479     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2480 
2481     // TODO: Should this propagate fast-math-flags?
2482     B.buildFAdd(Dst, LdExp, CvtLo);
2483     MI.eraseFromParent();
2484     return true;
2485   }
2486 
2487   assert(MRI.getType(Dst) == S32);
2488 
2489   auto One = B.buildConstant(S32, 1);
2490 
2491   MachineInstrBuilder ShAmt;
2492   if (Signed) {
2493     auto ThirtyOne = B.buildConstant(S32, 31);
2494     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2495     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2496     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2497     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2498                   .addUse(Unmerge.getReg(1));
2499     auto LS2 = B.buildSub(S32, LS, One);
2500     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2501   } else
2502     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2503   auto Norm = B.buildShl(S64, Src, ShAmt);
2504   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2505   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2506   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2507   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2508   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2509   B.buildFLdexp(Dst, FVal, Scale);
2510   MI.eraseFromParent();
2511   return true;
2512 }
2513 
2514 // TODO: Copied from DAG implementation. Verify logic and document how this
2515 // actually works.
2516 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2517                                         MachineRegisterInfo &MRI,
2518                                         MachineIRBuilder &B,
2519                                         bool Signed) const {
2520 
2521   Register Dst = MI.getOperand(0).getReg();
2522   Register Src = MI.getOperand(1).getReg();
2523 
2524   const LLT S64 = LLT::scalar(64);
2525   const LLT S32 = LLT::scalar(32);
2526 
2527   const LLT SrcLT = MRI.getType(Src);
2528   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2529 
2530   unsigned Flags = MI.getFlags();
2531 
2532   // The basic idea of converting a floating point number into a pair of 32-bit
2533   // integers is illustrated as follows:
2534   //
2535   //     tf := trunc(val);
2536   //    hif := floor(tf * 2^-32);
2537   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2538   //     hi := fptoi(hif);
2539   //     lo := fptoi(lof);
2540   //
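       // As a worked example (informal), for val = 2^33 + 5 = 8589934597.0:
       //    tf  = 8589934597.0
       //   hif  = floor(8589934597.0 * 2^-32) = 2.0
       //   lof  = 8589934597.0 - 2.0 * 2^32  = 5.0
       //    hi  = 2, lo = 5, giving the 64-bit result 0x0000000200000005.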
2541   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2542   MachineInstrBuilder Sign;
2543   if (Signed && SrcLT == S32) {
2544     // However, a 32-bit floating point number has only 23 bits mantissa and
2545     // it's not enough to hold all the significant bits of `lof` if val is
2546     // negative. To avoid the loss of precision, we need to take the absolute
2547     // value after truncating and flip the result back based on the original
2548     // signedness.
2549     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2550     Trunc = B.buildFAbs(S32, Trunc, Flags);
2551   }
2552   MachineInstrBuilder K0, K1;
2553   if (SrcLT == S64) {
2554     K0 = B.buildFConstant(
2555         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2556     K1 = B.buildFConstant(
2557         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2558   } else {
2559     K0 = B.buildFConstant(
2560         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2561     K1 = B.buildFConstant(
2562         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2563   }
2564 
2565   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2566   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2567   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2568 
2569   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2570                                      : B.buildFPTOUI(S32, FloorMul);
2571   auto Lo = B.buildFPTOUI(S32, Fma);
2572 
2573   if (Signed && SrcLT == S32) {
2574     // Flip the result based on the signedness, which is either all 0s or 1s.
2575     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2576     // r := xor({lo, hi}, sign) - sign;
2577     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2578                Sign);
2579   } else
2580     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2581   MI.eraseFromParent();
2582 
2583   return true;
2584 }
2585 
2586 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2587                                                MachineInstr &MI) const {
2588   MachineFunction &MF = Helper.MIRBuilder.getMF();
2589   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2590 
2591   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2592                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2593 
2594   // With ieee_mode disabled, the instructions have the correct behavior
2595   // already for G_FMINNUM/G_FMAXNUM
2596   if (!MFI->getMode().IEEE)
2597     return !IsIEEEOp;
2598 
2599   if (IsIEEEOp)
2600     return true;
2601 
2602   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2603 }
2604 
2605 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2606   MachineInstr &MI, MachineRegisterInfo &MRI,
2607   MachineIRBuilder &B) const {
2608   // TODO: Should move some of this into LegalizerHelper.
2609 
2610   // TODO: Promote dynamic indexing of s16 to s32
2611 
2612   Register Dst = MI.getOperand(0).getReg();
2613   Register Vec = MI.getOperand(1).getReg();
2614 
2615   LLT VecTy = MRI.getType(Vec);
2616   LLT EltTy = VecTy.getElementType();
2617   assert(EltTy == MRI.getType(Dst));
2618 
2619   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts,
2620   // but we can't go directly to that logic because you can't bitcast a vector
2621   // of pointers to a vector of integers. Therefore, introduce an intermediate
2622   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2623   // drive the legalization forward.
2624   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2625     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2626     LLT IntVecTy = VecTy.changeElementType(IntTy);
2627 
2628     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2629     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2630     B.buildIntToPtr(Dst, IntElt);
2631 
2632     MI.eraseFromParent();
2633     return true;
2634   }
2635 
2636   // FIXME: Artifact combiner probably should have replaced the truncated
2637   // constant before this, so we shouldn't need
2638   // getIConstantVRegValWithLookThrough.
2639   std::optional<ValueAndVReg> MaybeIdxVal =
2640       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2641   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2642     return true;
2643   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2644 
2645   if (IdxVal < VecTy.getNumElements()) {
2646     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2647     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2648   } else {
2649     B.buildUndef(Dst);
2650   }
2651 
2652   MI.eraseFromParent();
2653   return true;
2654 }
2655 
2656 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2657   MachineInstr &MI, MachineRegisterInfo &MRI,
2658   MachineIRBuilder &B) const {
2659   // TODO: Should move some of this into LegalizerHelper.
2660 
2661   // TODO: Promote dynamic indexing of s16 to s32
2662 
2663   Register Dst = MI.getOperand(0).getReg();
2664   Register Vec = MI.getOperand(1).getReg();
2665   Register Ins = MI.getOperand(2).getReg();
2666 
2667   LLT VecTy = MRI.getType(Vec);
2668   LLT EltTy = VecTy.getElementType();
2669   assert(EltTy == MRI.getType(Ins));
2670 
2671   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts,
2672   // but we can't go directly to that logic because you can't bitcast a vector
2673   // of pointers to a vector of integers. Therefore, make the pointer vector
2674   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2675   // new value, and then inttoptr the result vector back. This will then allow
2676   // the rest of legalization to take over.
2677   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2678     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2679     LLT IntVecTy = VecTy.changeElementType(IntTy);
2680 
2681     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2682     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2683     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2684                                                  MI.getOperand(3));
2685     B.buildIntToPtr(Dst, IntVecDest);
2686     MI.eraseFromParent();
2687     return true;
2688   }
2689 
2690   // FIXME: Artifact combiner probably should have replaced the truncated
2691   // constant before this, so we shouldn't need
2692   // getIConstantVRegValWithLookThrough.
2693   std::optional<ValueAndVReg> MaybeIdxVal =
2694       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2695   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2696     return true;
2697 
2698   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699 
2700   unsigned NumElts = VecTy.getNumElements();
2701   if (IdxVal < NumElts) {
2702     SmallVector<Register, 8> SrcRegs;
2703     for (unsigned i = 0; i < NumElts; ++i)
2704       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2705     B.buildUnmerge(SrcRegs, Vec);
2706 
2707     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2708     B.buildMergeLikeInstr(Dst, SrcRegs);
2709   } else {
2710     B.buildUndef(Dst);
2711   }
2712 
2713   MI.eraseFromParent();
2714   return true;
2715 }
2716 
2717 bool AMDGPULegalizerInfo::legalizeSinCos(
2718   MachineInstr &MI, MachineRegisterInfo &MRI,
2719   MachineIRBuilder &B) const {
2720 
2721   Register DstReg = MI.getOperand(0).getReg();
2722   Register SrcReg = MI.getOperand(1).getReg();
2723   LLT Ty = MRI.getType(DstReg);
2724   unsigned Flags = MI.getFlags();
2725 
2726   Register TrigVal;
2727   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2728   if (ST.hasTrigReducedRange()) {
2729     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2730     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2731                   .addUse(MulVal.getReg(0))
2732                   .setMIFlags(Flags)
2733                   .getReg(0);
2734   } else
2735     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2736 
2737   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2738     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2739   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2740       .addUse(TrigVal)
2741       .setMIFlags(Flags);
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2747                                                   MachineIRBuilder &B,
2748                                                   const GlobalValue *GV,
2749                                                   int64_t Offset,
2750                                                   unsigned GAFlags) const {
2751   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2752   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2753   // to the following code sequence:
2754   //
2755   // For constant address space:
2756   //   s_getpc_b64 s[0:1]
2757   //   s_add_u32 s0, s0, $symbol
2758   //   s_addc_u32 s1, s1, 0
2759   //
2760   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2761   //   a fixup or relocation is emitted to replace $symbol with a literal
2762   //   constant, which is a pc-relative offset from the encoding of the $symbol
2763   //   operand to the global variable.
2764   //
2765   // For global address space:
2766   //   s_getpc_b64 s[0:1]
2767   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2768   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2769   //
2770   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2771   //   fixups or relocations are emitted to replace $symbol@*@lo and
2772   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2773   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2774   //   operand to the global variable.
2775 
2776   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2777 
2778   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2779     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2780 
2781   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2782     .addDef(PCReg);
2783 
2784   MIB.addGlobalAddress(GV, Offset, GAFlags);
2785   if (GAFlags == SIInstrInfo::MO_NONE)
2786     MIB.addImm(0);
2787   else
2788     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2789 
2790   if (!B.getMRI()->getRegClassOrNull(PCReg))
2791     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2792 
2793   if (PtrTy.getSizeInBits() == 32)
2794     B.buildExtract(DstReg, PCReg, 0);
2795   return true;
2796 }
2797 
2798 // Emit an ABS32_LO / ABS32_HI relocation stub.
2799 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2800     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2801     MachineRegisterInfo &MRI) const {
2802   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2803 
2804   LLT S32 = LLT::scalar(32);
2805 
2806   // Use the destination directly, if and only if we store the lower address
2807   // part only and we don't have a register class being set.
2808   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2809                         ? DstReg
2810                         : MRI.createGenericVirtualRegister(S32);
2811 
2812   if (!MRI.getRegClassOrNull(AddrLo))
2813     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2814 
2815   // Write the lower half.
2816   B.buildInstr(AMDGPU::S_MOV_B32)
2817       .addDef(AddrLo)
2818       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2819 
2820   // If required, write the upper half as well.
2821   if (RequiresHighHalf) {
2822     assert(PtrTy.getSizeInBits() == 64 &&
2823            "Must provide a 64-bit pointer type!");
2824 
2825     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2826     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2827 
2828     B.buildInstr(AMDGPU::S_MOV_B32)
2829         .addDef(AddrHi)
2830         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2831 
2832     // Use the destination directly if and only if no register class has
2833     // been set on it.
2834     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2835                            ? DstReg
2836                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2837 
2838     if (!MRI.getRegClassOrNull(AddrDst))
2839       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2840 
2841     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2842 
2843     // If we created a new register for the destination, cast the result into
2844     // the final output.
2845     if (AddrDst != DstReg)
2846       B.buildCast(DstReg, AddrDst);
2847   } else if (AddrLo != DstReg) {
2848     // If we created a new register for the destination, cast the result into
2849     // the final output.
2850     B.buildCast(DstReg, AddrLo);
2851   }
2852 }
2853 
2854 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2855   MachineInstr &MI, MachineRegisterInfo &MRI,
2856   MachineIRBuilder &B) const {
2857   Register DstReg = MI.getOperand(0).getReg();
2858   LLT Ty = MRI.getType(DstReg);
2859   unsigned AS = Ty.getAddressSpace();
2860 
2861   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2862   MachineFunction &MF = B.getMF();
2863   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2864 
2865   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2866     if (!MFI->isModuleEntryFunction() &&
2867         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2868       const Function &Fn = MF.getFunction();
2869       DiagnosticInfoUnsupported BadLDSDecl(
2870         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2871         DS_Warning);
2872       Fn.getContext().diagnose(BadLDSDecl);
2873 
2874       // We currently don't have a way to correctly allocate LDS objects that
2875       // aren't directly associated with a kernel. We do force inlining of
2876       // functions that use local objects. However, if these dead functions are
2877       // not eliminated, we don't want a compile time error. Just emit a warning
2878       // and a trap, since there should be no callable path here.
2879       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
2880       B.buildUndef(DstReg);
2881       MI.eraseFromParent();
2882       return true;
2883     }
2884 
2885     // TODO: We could emit code to handle the initialization somewhere.
2886     // We ignore the initializer for now and legalize it to allow selection.
2887     // The initializer is rejected during assembly emission anyway.
2888     const SITargetLowering *TLI = ST.getTargetLowering();
2889     if (!TLI->shouldUseLDSConstAddress(GV)) {
2890       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2891       return true; // Leave in place;
2892     }
2893 
2894     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2895       Type *Ty = GV->getValueType();
2896       // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2897       // zero-sized type in other languages, to declare dynamic shared memory
2898       // whose size is not known at compile time. It is allocated by the
2899       // runtime and placed directly after the statically allocated memory,
2900       // and all such declarations share the same offset.
2901       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2902         // Adjust alignment for that dynamic shared memory array.
2903         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2904         LLT S32 = LLT::scalar(32);
2905         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2906         B.buildIntToPtr(DstReg, Sz);
2907         MI.eraseFromParent();
2908         return true;
2909       }
2910     }
2911 
2912     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2913                                                    *cast<GlobalVariable>(GV)));
2914     MI.eraseFromParent();
2915     return true;
2916   }
2917 
2918   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2919     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2920     MI.eraseFromParent();
2921     return true;
2922   }
2923 
2924   const SITargetLowering *TLI = ST.getTargetLowering();
2925 
2926   if (TLI->shouldEmitFixup(GV)) {
2927     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2928     MI.eraseFromParent();
2929     return true;
2930   }
2931 
2932   if (TLI->shouldEmitPCReloc(GV)) {
2933     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2934     MI.eraseFromParent();
2935     return true;
2936   }
2937 
2938   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2939   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2940 
2941   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2942   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2943       MachinePointerInfo::getGOT(MF),
2944       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2945           MachineMemOperand::MOInvariant,
2946       LoadTy, Align(8));
2947 
2948   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2949 
2950   if (Ty.getSizeInBits() == 32) {
2951     // Truncate if this is a 32-bit constant address.
2952     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2953     B.buildExtract(DstReg, Load, 0);
2954   } else
2955     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2956 
2957   MI.eraseFromParent();
2958   return true;
2959 }
2960 
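// e.g. s24 -> s32 and <3 x s16> -> <4 x s16>; for vectors only the element
// count is rounded up, the element type is left unchanged.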
2961 static LLT widenToNextPowerOf2(LLT Ty) {
2962   if (Ty.isVector())
2963     return Ty.changeElementCount(
2964         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2965   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2966 }
2967 
2968 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2969                                        MachineInstr &MI) const {
2970   MachineIRBuilder &B = Helper.MIRBuilder;
2971   MachineRegisterInfo &MRI = *B.getMRI();
2972   GISelChangeObserver &Observer = Helper.Observer;
2973 
2974   Register PtrReg = MI.getOperand(1).getReg();
2975   LLT PtrTy = MRI.getType(PtrReg);
2976   unsigned AddrSpace = PtrTy.getAddressSpace();
2977 
2978   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2979     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2980     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2981     Observer.changingInstr(MI);
2982     MI.getOperand(1).setReg(Cast.getReg(0));
2983     Observer.changedInstr(MI);
2984     return true;
2985   }
2986 
2987   if (MI.getOpcode() != AMDGPU::G_LOAD)
2988     return false;
2989 
2990   Register ValReg = MI.getOperand(0).getReg();
2991   LLT ValTy = MRI.getType(ValReg);
2992 
2993   if (hasBufferRsrcWorkaround(ValTy)) {
2994     Observer.changingInstr(MI);
2995     castBufferRsrcFromV4I32(MI, B, MRI, 0);
2996     Observer.changedInstr(MI);
2997     return true;
2998   }
2999 
3000   MachineMemOperand *MMO = *MI.memoperands_begin();
3001   const unsigned ValSize = ValTy.getSizeInBits();
3002   const LLT MemTy = MMO->getMemoryType();
3003   const Align MemAlign = MMO->getAlign();
3004   const unsigned MemSize = MemTy.getSizeInBits();
3005   const uint64_t AlignInBits = 8 * MemAlign.value();
3006 
3007   // Widen loads whose memory size is not a power of 2 when the alignment allows it.
3008   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3009     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
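    // e.g. a 96-bit memory access (s96 or <3 x s32>) widens to 128 bits here.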
3010 
3011     // This was already the correct extending load result type, so just adjust
3012     // the memory type.
3013     if (WideMemSize == ValSize) {
3014       MachineFunction &MF = B.getMF();
3015 
3016       MachineMemOperand *WideMMO =
3017           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3018       Observer.changingInstr(MI);
3019       MI.setMemRefs(MF, {WideMMO});
3020       Observer.changedInstr(MI);
3021       return true;
3022     }
3023 
3024     // Don't bother handling an edge case that should probably never be produced.
3025     if (ValSize > WideMemSize)
3026       return false;
3027 
3028     LLT WideTy = widenToNextPowerOf2(ValTy);
3029 
3030     Register WideLoad;
3031     if (!WideTy.isVector()) {
3032       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3033       B.buildTrunc(ValReg, WideLoad);
3034     } else {
3035       // Extract the subvector.
3036 
3037       if (isRegisterType(ValTy)) {
3038         // If this is a case where G_EXTRACT is legal, use it.
3039         // (e.g. <3 x s32> -> <4 x s32>)
3040         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3041         B.buildExtract(ValReg, WideLoad, 0);
3042       } else {
3043         // For cases where the widened type isn't a nice register value, load the
3044         // wider value and drop the trailing elements (e.g. <3 x s16> -> <4 x s16>).
3045         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3046         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3047       }
3048     }
3049 
3050     MI.eraseFromParent();
3051     return true;
3052   }
3053 
3054   return false;
3055 }
3056 
3057 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3058                                         MachineInstr &MI) const {
3059   MachineIRBuilder &B = Helper.MIRBuilder;
3060   MachineRegisterInfo &MRI = *B.getMRI();
3061   GISelChangeObserver &Observer = Helper.Observer;
3062 
3063   Register DataReg = MI.getOperand(0).getReg();
3064   LLT DataTy = MRI.getType(DataReg);
3065 
3066   if (hasBufferRsrcWorkaround(DataTy)) {
3067     Observer.changingInstr(MI);
3068     castBufferRsrcArgToV4I32(MI, B, 0);
3069     Observer.changedInstr(MI);
3070     return true;
3071   }
3072   return false;
3073 }
3074 
3075 bool AMDGPULegalizerInfo::legalizeFMad(
3076   MachineInstr &MI, MachineRegisterInfo &MRI,
3077   MachineIRBuilder &B) const {
3078   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3079   assert(Ty.isScalar());
3080 
3081   MachineFunction &MF = B.getMF();
3082   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3083 
3084   // TODO: Always legal with future ftz flag.
3085   // FIXME: Do we only need to check the output denormal mode?
3086   if (Ty == LLT::float32() &&
3087       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3088     return true;
3089   if (Ty == LLT::float16() &&
3090       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3091     return true;
3092 
3093   MachineIRBuilder HelperBuilder(MI);
3094   GISelObserverWrapper DummyObserver;
3095   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3096   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3097 }
3098 
3099 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3100   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3101   Register DstReg = MI.getOperand(0).getReg();
3102   Register PtrReg = MI.getOperand(1).getReg();
3103   Register CmpVal = MI.getOperand(2).getReg();
3104   Register NewVal = MI.getOperand(3).getReg();
3105 
3106   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3107          "this should not have been custom lowered");
3108 
3109   LLT ValTy = MRI.getType(CmpVal);
3110   LLT VecTy = LLT::fixed_vector(2, ValTy);
3111 
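  // The target pseudo takes the new value and the compare value packed into a
  // single two-element vector operand, e.g. for a 32-bit cmpxchg:
  //   %packed:_(<2 x s32>) = G_BUILD_VECTOR %newval, %cmpval
  //   %dst:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %packed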
3112   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3113 
3114   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3115     .addDef(DstReg)
3116     .addUse(PtrReg)
3117     .addUse(PackedVal)
3118     .setMemRefs(MI.memoperands());
3119 
3120   MI.eraseFromParent();
3121   return true;
3122 }
3123 
3124 /// Return true if it's known that \p Src can never be an f32 denormal value.
3125 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3126                                        Register Src) {
3127   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3128   switch (DefMI->getOpcode()) {
3129   case TargetOpcode::G_INTRINSIC: {
3130     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3131     case Intrinsic::amdgcn_frexp_mant:
3132       return true;
3133     default:
3134       break;
3135     }
3136 
3137     break;
3138   }
3139   case TargetOpcode::G_FFREXP: {
3140     if (DefMI->getOperand(0).getReg() == Src)
3141       return true;
3142     break;
3143   }
3144   case TargetOpcode::G_FPEXT: {
3145     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3146   }
3147   default:
3148     return false;
3149   }
3150 
3151   return false;
3152 }
3153 
3154 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3155   if (Flags & MachineInstr::FmAfn)
3156     return true;
3157   const auto &Options = MF.getTarget().Options;
3158   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3159 }
3160 
3161 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3162                                    unsigned Flags) {
3163   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3164          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3165              DenormalMode::PreserveSign;
3166 }
3167 
3168 std::pair<Register, Register>
3169 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3170                                        unsigned Flags) const {
3171   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3172     return {};
3173 
3174   const LLT F32 = LLT::scalar(32);
3175   auto SmallestNormal = B.buildFConstant(
3176       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3177   auto IsLtSmallestNormal =
3178       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3179 
3180   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3181   auto One = B.buildFConstant(F32, 1.0);
3182   auto ScaleFactor =
3183       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3184   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3185 
3186   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3187 }
3188 
3189 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3190                                         MachineIRBuilder &B) const {
3191   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3192   // If we have to handle denormals, scale up the input and adjust the result.
3193 
3194   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3195   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
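  //
  // e.g. for x = 0x1.0p-130 (a denormal): scaled = 0x1.0p-98,
  // amdgpu_log2(scaled) = -98.0, and -98.0 - 32.0 = -130.0 = log2(x).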
3196 
3197   Register Dst = MI.getOperand(0).getReg();
3198   Register Src = MI.getOperand(1).getReg();
3199   LLT Ty = B.getMRI()->getType(Dst);
3200   unsigned Flags = MI.getFlags();
3201 
3202   if (Ty == LLT::scalar(16)) {
3203     const LLT F32 = LLT::scalar(32);
3204     // Nothing in half is a denormal when promoted to f32.
3205     auto Ext = B.buildFPExt(F32, Src, Flags);
3206     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3207                     .addUse(Ext.getReg(0))
3208                     .setMIFlags(Flags);
3209     B.buildFPTrunc(Dst, Log2, Flags);
3210     MI.eraseFromParent();
3211     return true;
3212   }
3213 
3214   assert(Ty == LLT::scalar(32));
3215 
3216   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3217   if (!ScaledInput) {
3218     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3219         .addUse(Src)
3220         .setMIFlags(Flags);
3221     MI.eraseFromParent();
3222     return true;
3223   }
3224 
3225   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3226                   .addUse(ScaledInput)
3227                   .setMIFlags(Flags);
3228 
3229   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3230   auto Zero = B.buildFConstant(Ty, 0.0);
3231   auto ResultOffset =
3232       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3233   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3234 
3235   MI.eraseFromParent();
3236   return true;
3237 }
3238 
3239 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3240                        Register Z, unsigned Flags) {
3241   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3242   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3243 }
3244 
3245 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3246                                              MachineIRBuilder &B) const {
3247   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3248   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3249 
3250   MachineRegisterInfo &MRI = *B.getMRI();
3251   Register Dst = MI.getOperand(0).getReg();
3252   Register X = MI.getOperand(1).getReg();
3253   unsigned Flags = MI.getFlags();
3254   const LLT Ty = MRI.getType(X);
3255   MachineFunction &MF = B.getMF();
3256 
3257   const LLT F32 = LLT::scalar(32);
3258   const LLT F16 = LLT::scalar(16);
3259 
3260   const AMDGPUTargetMachine &TM =
3261       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3262 
3263   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3264       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3265     if (Ty == F16 && !ST.has16BitInsts()) {
3266       Register LogVal = MRI.createGenericVirtualRegister(F32);
3267       auto PromoteSrc = B.buildFPExt(F32, X);
3268       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3269       B.buildFPTrunc(Dst, LogVal);
3270     } else {
3271       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3272     }
3273 
3274     MI.eraseFromParent();
3275     return true;
3276   }
3277 
3278   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3279   if (ScaledInput)
3280     X = ScaledInput;
3281 
3282   auto Y =
3283       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3284 
3285   Register R;
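  // Both branches compute R ~= Y * log_b(2), with the constant split into a
  // high part plus a small correction so the product keeps more precision
  // than a single f32 multiply:
  //   fast-FMA path: R = Y*c; R += fma(Y, c, -R) + Y*cc
  //   mad path:      Y is split as YH + YT (low 12 mantissa bits cleared) and
  //                  the four partial products are summed smallest-first.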
3286   if (ST.hasFastFMAF32()) {
3287     // c + cc is ln(2)/ln(10) to more than 49 bits
3288     const float c_log10 = 0x1.344134p-2f;
3289     const float cc_log10 = 0x1.09f79ep-26f;
3290 
3291     // c + cc is ln(2) to more than 49 bits
3292     const float c_log = 0x1.62e42ep-1f;
3293     const float cc_log = 0x1.efa39ep-25f;
3294 
3295     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3296     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3297 
3298     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3299     auto NegR = B.buildFNeg(Ty, R, Flags);
3300     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3301     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3302     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3303   } else {
3304     // ch+ct is ln(2)/ln(10) to more than 36 bits
3305     const float ch_log10 = 0x1.344000p-2f;
3306     const float ct_log10 = 0x1.3509f6p-18f;
3307 
3308     // ch + ct is ln(2) to more than 36 bits
3309     const float ch_log = 0x1.62e000p-1f;
3310     const float ct_log = 0x1.0bfbe8p-15f;
3311 
3312     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3313     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3314 
3315     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3316     auto YH = B.buildAnd(Ty, Y, MaskConst);
3317     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3318     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3319 
3320     Register Mad0 =
3321         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3322     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3323     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3324   }
3325 
3326   const bool IsFiniteOnly =
3327       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3328       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3329 
3330   if (!IsFiniteOnly) {
3331     // Expand isfinite(x) => fabs(x) < inf
3332     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3333     auto Fabs = B.buildFAbs(Ty, Y);
3334     auto IsFinite =
3335         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3336     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3337   }
3338 
3339   if (ScaledInput) {
3340     auto Zero = B.buildFConstant(Ty, 0.0);
3341     auto ShiftK =
3342         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3343     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3344     B.buildFSub(Dst, R, Shift, Flags);
3345   } else {
3346     B.buildCopy(Dst, R);
3347   }
3348 
3349   MI.eraseFromParent();
3350   return true;
3351 }
3352 
3353 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3354                                              Register Src, bool IsLog10,
3355                                              unsigned Flags) const {
3356   const double Log2BaseInverted =
3357       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
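  // i.e. log10(x) = log2(x) * log10(2) and ln(x) = log2(x) * ln(2).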
3358 
3359   LLT Ty = B.getMRI()->getType(Dst);
3360 
3361   if (Ty == LLT::scalar(32)) {
3362     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3363     if (ScaledInput) {
3364       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3365                         .addUse(Src)
3366                         .setMIFlags(Flags);
3367       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3368       auto Zero = B.buildFConstant(Ty, 0.0);
3369       auto ResultOffset =
3370           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3371       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3372 
3373       if (ST.hasFastFMAF32())
3374         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3375       else {
3376         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3377         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3378       }
3379 
3380       return true;
3381     }
3382   }
3383 
3384   auto Log2Operand = Ty == LLT::scalar(16)
3385                          ? B.buildFLog2(Ty, Src, Flags)
3386                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3387                                .addUse(Src)
3388                                .setMIFlags(Flags);
3389   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3390   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3391   return true;
3392 }
3393 
3394 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3395                                         MachineIRBuilder &B) const {
3396   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3397   // If we have to handle denormals, scale up the input and adjust the result.
3398 
3399   Register Dst = MI.getOperand(0).getReg();
3400   Register Src = MI.getOperand(1).getReg();
3401   unsigned Flags = MI.getFlags();
3402   LLT Ty = B.getMRI()->getType(Dst);
3403   const LLT F16 = LLT::scalar(16);
3404   const LLT F32 = LLT::scalar(32);
3405 
3406   if (Ty == F16) {
3407     // Nothing in half is a denormal when promoted to f32.
3408     auto Ext = B.buildFPExt(F32, Src, Flags);
3409     auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3410                     .addUse(Ext.getReg(0))
3411                     .setMIFlags(Flags);
3412     B.buildFPTrunc(Dst, Exp2, Flags);
3413     MI.eraseFromParent();
3414     return true;
3415   }
3416 
3417   assert(Ty == F32);
3418 
3419   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3420     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3421         .addUse(Src)
3422         .setMIFlags(Flags);
3423     MI.eraseFromParent();
3424     return true;
3425   }
3426 
3427   // bool needs_scaling = x < -0x1.f80000p+6f;
3428   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
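  //
  // e.g. x = -140.0: needs_scaling is true, v_exp_f32(-140 + 64) = 0x1.0p-76,
  // and 0x1.0p-76 * 0x1.0p-64 = 0x1.0p-140, the denormal result that
  // v_exp_f32 alone would not produce.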
3429 
3430   // -nextafter(128.0, -1)
3431   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3432   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3433                                   RangeCheckConst, Flags);
3434 
3435   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3436   auto Zero = B.buildFConstant(Ty, 0.0);
3437   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3438   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3439 
3440   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3441                   .addUse(AddInput.getReg(0))
3442                   .setMIFlags(Flags);
3443 
3444   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3445   auto One = B.buildFConstant(Ty, 1.0);
3446   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3447   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
3452 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3453                                              Register X, unsigned Flags) const {
3454   LLT Ty = B.getMRI()->getType(Dst);
3455   LLT F32 = LLT::scalar(32);
3456 
3457   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3458     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3459     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3460 
3461     if (Ty == F32) {
3462       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3463         .addUse(Mul.getReg(0))
3464         .setMIFlags(Flags);
3465     } else {
3466       B.buildFExp2(Dst, Mul.getReg(0), Flags);
3467     }
3468 
3469     return true;
3470   }
3471 
3472   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3473   auto NeedsScaling =
3474       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3475   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3476   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3477   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3478 
3479   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3480   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3481 
3482   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3483     .addUse(ExpInput.getReg(0))
3484     .setMIFlags(Flags);
3485 
3486   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3487   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3488   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3489   return true;
3490 }
3491 
3492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3493                                        MachineIRBuilder &B) const {
3494   Register Dst = MI.getOperand(0).getReg();
3495   Register X = MI.getOperand(1).getReg();
3496   const unsigned Flags = MI.getFlags();
3497   MachineFunction &MF = B.getMF();
3498   MachineRegisterInfo &MRI = *B.getMRI();
3499   LLT Ty = MRI.getType(Dst);
3500   const LLT F16 = LLT::scalar(16);
3501   const LLT F32 = LLT::scalar(32);
3502   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3503 
3504   if (Ty == F16) {
3505     // v_exp_f16 (fmul x, log2e)
3506     if (allowApproxFunc(MF, Flags)) {
3507       // TODO: Does this really require fast?
3508       legalizeFExpUnsafe(B, Dst, X, Flags);
3509       MI.eraseFromParent();
3510       return true;
3511     }
3512 
3513     // exp(f16 x) ->
3514     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3515 
3516     // Nothing in half is a denormal when promoted to f32.
3517     auto Ext = B.buildFPExt(F32, X, Flags);
3518     Register Lowered = MRI.createGenericVirtualRegister(F32);
3519     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3520     B.buildFPTrunc(Dst, Lowered, Flags);
3521     MI.eraseFromParent();
3522     return true;
3523   }
3524 
3525   assert(Ty == F32);
3526 
3527   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3528   // library behavior. Also, is known-not-daz source sufficient?
3529   if (allowApproxFunc(MF, Flags)) {
3530     legalizeFExpUnsafe(B, Dst, X, Flags);
3531     MI.eraseFromParent();
3532     return true;
3533   }
3534 
3535   //    Algorithm:
3536   //
3537   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3538   //
3539   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3540   //    n = 64*m + j,   0 <= j < 64
3541   //
3542   //    e^x = 2^((64*m + j + f)/64)
3543   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3544   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3545   //
3546   //    f = x*(64/ln(2)) - n
3547   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3548   //
3549   //    e^x = (2^m) * (2^(j/64)) * e^r
3550   //
3551   //    (2^(j/64)) is precomputed
3552   //
3553   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3554   //    e^r = 1 + q
3555   //
3556   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3557   //
3558   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
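  //
  //    The expansion below follows the same splitting idea without the
  //    2^(j/64) table: PH + PL approximates x * log2(e) (or x * log2(10) for
  //    exp10), n is PH rounded to the nearest integer, and the result is
  //    ldexp(v_exp_f32(PH - n + PL), n), clamped for underflow and overflow
  //    afterwards.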
3559   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3560   Register PH, PL;
3561 
3562   if (ST.hasFastFMAF32()) {
3563     const float c_exp = numbers::log2ef;
3564     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3565     const float c_exp10 = 0x1.a934f0p+1f;
3566     const float cc_exp10 = 0x1.2f346ep-24f;
3567 
3568     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3569     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3570     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3571     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3572 
3573     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3574     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3575   } else {
3576     const float ch_exp = 0x1.714000p+0f;
3577     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3578 
3579     const float ch_exp10 = 0x1.a92000p+1f;
3580     const float cl_exp10 = 0x1.4f0978p-11f;
3581 
3582     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3583     auto XH = B.buildAnd(Ty, X, MaskConst);
3584     auto XL = B.buildFSub(Ty, X, XH, Flags);
3585 
3586     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3587     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3588 
3589     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3590     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3591 
3592     Register Mad0 =
3593         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3594     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3595   }
3596 
3597   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3598 
3599   // It is unsafe to contract this fsub into the PH multiply.
3600   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3601   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3602   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3603 
3604   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3605                   .addUse(A.getReg(0))
3606                   .setMIFlags(Flags);
3607   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3608 
3609   auto UnderflowCheckConst =
3610       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3611   auto Zero = B.buildFConstant(Ty, 0.0);
3612   auto Underflow =
3613       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3614 
3615   R = B.buildSelect(Ty, Underflow, Zero, R);
3616 
3617   const auto &Options = MF.getTarget().Options;
3618 
3619   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3620     auto OverflowCheckConst =
3621         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3622 
3623     auto Overflow =
3624         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3625     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3626     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3627   }
3628 
3629   B.buildCopy(Dst, R);
3630   MI.eraseFromParent();
3631   return true;
3632 }
3633 
3634 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3635                                        MachineIRBuilder &B) const {
3636   Register Dst = MI.getOperand(0).getReg();
3637   Register Src0 = MI.getOperand(1).getReg();
3638   Register Src1 = MI.getOperand(2).getReg();
3639   unsigned Flags = MI.getFlags();
3640   LLT Ty = B.getMRI()->getType(Dst);
3641   const LLT F16 = LLT::float16();
3642   const LLT F32 = LLT::float32();
3643 
3644   if (Ty == F32) {
3645     auto Log = B.buildFLog2(F32, Src0, Flags);
3646     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3647                    .addUse(Log.getReg(0))
3648                    .addUse(Src1)
3649                    .setMIFlags(Flags);
3650     B.buildFExp2(Dst, Mul, Flags);
3651   } else if (Ty == F16) {
3652     // There's no f16 fmul_legacy, so we need to convert for it.
3653     auto Log = B.buildFLog2(F16, Src0, Flags);
3654     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3655     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3656     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3657                    .addUse(Ext0.getReg(0))
3658                    .addUse(Ext1.getReg(0))
3659                    .setMIFlags(Flags);
3660     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3661   } else
3662     return false;
3663 
3664   MI.eraseFromParent();
3665   return true;
3666 }
3667 
3668 // Find a source register, ignoring any possible source modifiers.
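// e.g. given %a = G_FABS %x and %s = G_FNEG %a, passing %s returns %x.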
3669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3670   Register ModSrc = OrigSrc;
3671   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3672     ModSrc = SrcFNeg->getOperand(1).getReg();
3673     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3674       ModSrc = SrcFAbs->getOperand(1).getReg();
3675   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3676     ModSrc = SrcFAbs->getOperand(1).getReg();
3677   return ModSrc;
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3681                                          MachineRegisterInfo &MRI,
3682                                          MachineIRBuilder &B) const {
3683 
3684   const LLT S1 = LLT::scalar(1);
3685   const LLT F64 = LLT::float64();
3686   Register Dst = MI.getOperand(0).getReg();
3687   Register OrigSrc = MI.getOperand(1).getReg();
3688   unsigned Flags = MI.getFlags();
3689   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3690          "this should not have been custom lowered");
3691 
3692   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3693   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3694   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3695   // V_FRACT bug is:
3696   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3697   //
3698   // Convert floor(x) to (x - fract(x))
3699 
3700   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3701                    .addUse(OrigSrc)
3702                    .setMIFlags(Flags);
3703 
3704   // Give source modifier matching some assistance before obscuring a foldable
3705   // pattern.
3706 
3707   // TODO: We can avoid the neg on the fract? The input sign to fract
3708   // shouldn't matter?
3709   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3710 
3711   auto Const =
3712       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3713 
3714   Register Min = MRI.createGenericVirtualRegister(F64);
3715 
3716   // We don't need to concern ourselves with the snan handling difference, so
3717   // use the one which will directly select.
3718   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3719   if (MFI->getMode().IEEE)
3720     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3721   else
3722     B.buildFMinNum(Min, Fract, Const, Flags);
3723 
3724   Register CorrectedFract = Min;
3725   if (!MI.getFlag(MachineInstr::FmNoNans)) {
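    // fract(x) must pass a NaN input through unchanged (see the formula
    // above), so select the source value whenever the input is a NaN.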
3726     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
3727     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3728   }
3729 
3730   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3731   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3732 
3733   MI.eraseFromParent();
3734   return true;
3735 }
3736 
3737 // Turn an illegal packed v2s16 build vector into bit operations.
3738 // TODO: This should probably be a bitcast action in LegalizerHelper.
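// e.g. G_BUILD_VECTOR_TRUNC %a:_(s32), %b:_(s32) becomes roughly:
//   %lo:_(s16) = G_TRUNC %a
//   %hi:_(s16) = G_TRUNC %b
//   %m:_(s32) = G_MERGE_VALUES %lo, %hi
//   %dst:_(<2 x s16>) = G_BITCAST %m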
3739 bool AMDGPULegalizerInfo::legalizeBuildVector(
3740   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3741   Register Dst = MI.getOperand(0).getReg();
3742   const LLT S32 = LLT::scalar(32);
3743   const LLT S16 = LLT::scalar(16);
3744   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3745 
3746   Register Src0 = MI.getOperand(1).getReg();
3747   Register Src1 = MI.getOperand(2).getReg();
3748 
3749   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3750     assert(MRI.getType(Src0) == S32);
3751     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3752     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3753   }
3754 
3755   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3756   B.buildBitcast(Dst, Merge);
3757 
3758   MI.eraseFromParent();
3759   return true;
3760 }
3761 
3762 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3763 //
3764 // Source and accumulation registers must all be 32-bits.
3765 //
3766 // TODO: When the multiply is uniform, we should produce a code sequence
3767 // that is better suited to instruction selection on the SALU. Instead of
3768 // the outer loop going over parts of the result, the outer loop should go
3769 // over parts of one of the factors. This should result in instruction
3770 // selection that makes full use of S_ADDC_U32 instructions.
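//
// e.g. a 64-bit G_MUL is handled as two 32-bit limbs per operand; Accum[k]
// gathers the partial products Src0[i] * Src1[j] with i + j == k, together
// with the high halves and carries spilling over from column k - 1.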
3771 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3772                                         MutableArrayRef<Register> Accum,
3773                                         ArrayRef<Register> Src0,
3774                                         ArrayRef<Register> Src1,
3775                                         bool UsePartialMad64_32,
3776                                         bool SeparateOddAlignedProducts) const {
3777   // Use (possibly empty) vectors of S1 registers to represent the set of
3778   // carries from one pair of positions to the next.
3779   using Carry = SmallVector<Register, 2>;
3780 
3781   MachineIRBuilder &B = Helper.MIRBuilder;
3782   GISelKnownBits &KB = *Helper.getKnownBits();
3783 
3784   const LLT S1 = LLT::scalar(1);
3785   const LLT S32 = LLT::scalar(32);
3786   const LLT S64 = LLT::scalar(64);
3787 
3788   Register Zero32;
3789   Register Zero64;
3790 
3791   auto getZero32 = [&]() -> Register {
3792     if (!Zero32)
3793       Zero32 = B.buildConstant(S32, 0).getReg(0);
3794     return Zero32;
3795   };
3796   auto getZero64 = [&]() -> Register {
3797     if (!Zero64)
3798       Zero64 = B.buildConstant(S64, 0).getReg(0);
3799     return Zero64;
3800   };
3801 
3802   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3803   for (unsigned i = 0; i < Src0.size(); ++i) {
3804     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3805     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3806   }
3807 
3808   // Merge the given carries into the 32-bit LocalAccum, which is modified
3809   // in-place.
3810   //
3811   // Returns the carry-out, which is a single S1 register or null.
3812   auto mergeCarry =
3813       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3814         if (CarryIn.empty())
3815           return Register();
3816 
3817         bool HaveCarryOut = true;
3818         Register CarryAccum;
3819         if (CarryIn.size() == 1) {
3820           if (!LocalAccum) {
3821             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3822             return Register();
3823           }
3824 
3825           CarryAccum = getZero32();
3826         } else {
3827           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3828           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3829             CarryAccum =
3830                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3831                     .getReg(0);
3832           }
3833 
3834           if (!LocalAccum) {
3835             LocalAccum = getZero32();
3836             HaveCarryOut = false;
3837           }
3838         }
3839 
3840         auto Add =
3841             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3842         LocalAccum = Add.getReg(0);
3843         return HaveCarryOut ? Add.getReg(1) : Register();
3844       };
3845 
3846   // Build a multiply-add chain to compute
3847   //
3848   //   LocalAccum + (partial products at DstIndex)
3849   //       + (opportunistic subset of CarryIn)
3850   //
3851   // LocalAccum is an array of one or two 32-bit registers that are updated
3852   // in-place. The incoming registers may be null.
3853   //
3854   // In some edge cases, carry-ins can be consumed "for free". In that case,
3855   // the consumed carry bits are removed from CarryIn in-place.
3856   auto buildMadChain =
3857       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3858           -> Carry {
3859         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3860                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3861 
3862         Carry CarryOut;
3863         unsigned j0 = 0;
3864 
3865         // Use plain 32-bit multiplication for the most significant part of the
3866         // result by default.
3867         if (LocalAccum.size() == 1 &&
3868             (!UsePartialMad64_32 || !CarryIn.empty())) {
3869           do {
3870             // Skip multiplication if one of the operands is 0
3871             unsigned j1 = DstIndex - j0;
3872             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3873               ++j0;
3874               continue;
3875             }
3876             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3877             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3878               LocalAccum[0] = Mul.getReg(0);
3879             } else {
3880               if (CarryIn.empty()) {
3881                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3882               } else {
3883                 LocalAccum[0] =
3884                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3885                         .getReg(0);
3886                 CarryIn.pop_back();
3887               }
3888             }
3889             ++j0;
3890           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3891         }
3892 
3893         // Build full 64-bit multiplies.
3894         if (j0 <= DstIndex) {
3895           bool HaveSmallAccum = false;
3896           Register Tmp;
3897 
3898           if (LocalAccum[0]) {
3899             if (LocalAccum.size() == 1) {
3900               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3901               HaveSmallAccum = true;
3902             } else if (LocalAccum[1]) {
3903               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3904               HaveSmallAccum = false;
3905             } else {
3906               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3907               HaveSmallAccum = true;
3908             }
3909           } else {
3910             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3911             Tmp = getZero64();
3912             HaveSmallAccum = true;
3913           }
3914 
3915           do {
3916             unsigned j1 = DstIndex - j0;
3917             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3918               ++j0;
3919               continue;
3920             }
3921             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3922                                     {Src0[j0], Src1[j1], Tmp});
3923             Tmp = Mad.getReg(0);
3924             if (!HaveSmallAccum)
3925               CarryOut.push_back(Mad.getReg(1));
3926             HaveSmallAccum = false;
3927 
3928             ++j0;
3929           } while (j0 <= DstIndex);
3930 
3931           auto Unmerge = B.buildUnmerge(S32, Tmp);
3932           LocalAccum[0] = Unmerge.getReg(0);
3933           if (LocalAccum.size() > 1)
3934             LocalAccum[1] = Unmerge.getReg(1);
3935         }
3936 
3937         return CarryOut;
3938       };
3939 
3940   // Outer multiply loop, iterating over destination parts from least
3941   // significant to most significant parts.
3942   //
3943   // The columns of the following diagram correspond to the destination parts
3944   // affected by one iteration of the outer loop (ignoring boundary
3945   // conditions).
3946   //
3947   //   Dest index relative to 2 * i:      1 0 -1
3948   //                                      ------
3949   //   Carries from previous iteration:     e o
3950   //   Even-aligned partial product sum:  E E .
3951   //   Odd-aligned partial product sum:     O O
3952   //
3953   // 'o' is OddCarry, 'e' is EvenCarry.
3954   // EE and OO are computed from partial products via buildMadChain and use
3955   // accumulation where possible and appropriate.
3956   //
3957   Register SeparateOddCarry;
3958   Carry EvenCarry;
3959   Carry OddCarry;
3960 
3961   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3962     Carry OddCarryIn = std::move(OddCarry);
3963     Carry EvenCarryIn = std::move(EvenCarry);
3964     OddCarry.clear();
3965     EvenCarry.clear();
3966 
3967     // Partial products at offset 2 * i.
3968     if (2 * i < Accum.size()) {
3969       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3970       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3971     }
3972 
3973     // Partial products at offset 2 * i - 1.
3974     if (i > 0) {
3975       if (!SeparateOddAlignedProducts) {
3976         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3977         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3978       } else {
3979         bool IsHighest = 2 * i >= Accum.size();
3980         Register SeparateOddOut[2];
3981         auto LocalAccum = MutableArrayRef(SeparateOddOut)
3982                               .take_front(IsHighest ? 1 : 2);
3983         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3984 
3985         MachineInstr *Lo;
3986 
3987         if (i == 1) {
3988           if (!IsHighest)
3989             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3990           else
3991             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3992         } else {
3993           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3994                             SeparateOddCarry);
3995         }
3996         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3997 
3998         if (!IsHighest) {
3999           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4000                                 Lo->getOperand(1).getReg());
4001           Accum[2 * i] = Hi.getReg(0);
4002           SeparateOddCarry = Hi.getReg(1);
4003         }
4004       }
4005     }
4006 
4007     // Add in the carries from the previous iteration
4008     if (i > 0) {
4009       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4010         EvenCarryIn.push_back(CarryOut);
4011 
4012       if (2 * i < Accum.size()) {
4013         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4014           OddCarry.push_back(CarryOut);
4015       }
4016     }
4017   }
4018 }
4019 
4020 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4021 //
4022 // TODO: If the multiply is followed by an addition, we should attempt to
4023 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4024 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4025                                       MachineInstr &MI) const {
4026   assert(ST.hasMad64_32());
4027   assert(MI.getOpcode() == TargetOpcode::G_MUL);
4028 
4029   MachineIRBuilder &B = Helper.MIRBuilder;
4030   MachineRegisterInfo &MRI = *B.getMRI();
4031 
4032   Register DstReg = MI.getOperand(0).getReg();
4033   Register Src0 = MI.getOperand(1).getReg();
4034   Register Src1 = MI.getOperand(2).getReg();
4035 
4036   LLT Ty = MRI.getType(DstReg);
4037   assert(Ty.isScalar());
4038 
4039   unsigned Size = Ty.getSizeInBits();
4040   unsigned NumParts = Size / 32;
4041   assert((Size % 32) == 0);
4042   assert(NumParts >= 2);
4043 
4044   // Whether to use MAD_64_32 for partial products whose high half is
4045   // discarded. This avoids some ADD instructions but risks false dependency
4046   // stalls on some subtargets in some cases.
4047   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4048 
4049   // Whether to compute odd-aligned partial products separately. This is
4050   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4051   // in an even-aligned VGPR.
4052   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4053 
4054   LLT S32 = LLT::scalar(32);
4055   SmallVector<Register, 2> Src0Parts, Src1Parts;
4056   for (unsigned i = 0; i < NumParts; ++i) {
4057     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4058     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4059   }
4060   B.buildUnmerge(Src0Parts, Src0);
4061   B.buildUnmerge(Src1Parts, Src1);
4062 
4063   SmallVector<Register, 2> AccumRegs(NumParts);
4064   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4065                 SeparateOddAlignedProducts);
4066 
4067   B.buildMergeLikeInstr(DstReg, AccumRegs);
4068   MI.eraseFromParent();
4069   return true;
4070 }
4071 
4072 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4073 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4074 // case with a single min instruction instead of a compare+select.
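// e.g. a 32-bit G_CTLZ becomes roughly:
//   %t:_(s32) = G_AMDGPU_FFBH_U32 %src   (all ones for a zero source)
//   %dst:_(s32) = G_UMIN %t, 32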
4075 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4076                                             MachineRegisterInfo &MRI,
4077                                             MachineIRBuilder &B) const {
4078   Register Dst = MI.getOperand(0).getReg();
4079   Register Src = MI.getOperand(1).getReg();
4080   LLT DstTy = MRI.getType(Dst);
4081   LLT SrcTy = MRI.getType(Src);
4082 
4083   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4084                         ? AMDGPU::G_AMDGPU_FFBH_U32
4085                         : AMDGPU::G_AMDGPU_FFBL_B32;
4086   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4087   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4088 
4089   MI.eraseFromParent();
4090   return true;
4091 }
4092 
4093 // Check that this is a G_XOR x, -1
4094 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4095   if (MI.getOpcode() != TargetOpcode::G_XOR)
4096     return false;
4097   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4098   return ConstVal && *ConstVal == -1;
4099 }
4100 
4101 // Return the branch instruction using the condition, or null if the use is invalid.
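// The expected pattern is the condition feeding a conditional branch in the
// same block, optionally through a single G_XOR with -1, followed either by a
// G_BR or by a fallthrough into the next block:
//   %c:_(s1), ... = G_INTRINSIC ...
//   G_BRCOND %c, %bb.target
//   G_BR %bb.fallthrough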
4102 static MachineInstr *
4103 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4104                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4105   Register CondDef = MI.getOperand(0).getReg();
4106   if (!MRI.hasOneNonDBGUse(CondDef))
4107     return nullptr;
4108 
4109   MachineBasicBlock *Parent = MI.getParent();
4110   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4111 
4112   if (isNot(MRI, *UseMI)) {
4113     Register NegatedCond = UseMI->getOperand(0).getReg();
4114     if (!MRI.hasOneNonDBGUse(NegatedCond))
4115       return nullptr;
4116 
4117     // We're deleting the def of this value, so we need to remove it.
4118     eraseInstr(*UseMI, MRI);
4119 
4120     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4121     Negated = true;
4122   }
4123 
4124   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4125     return nullptr;
4126 
4127   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4128   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4129   if (Next == Parent->end()) {
4130     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4131     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4132       return nullptr;
4133     UncondBrTarget = &*NextMBB;
4134   } else {
4135     if (Next->getOpcode() != AMDGPU::G_BR)
4136       return nullptr;
4137     Br = &*Next;
4138     UncondBrTarget = Br->getOperand(0).getMBB();
4139   }
4140 
4141   return UseMI;
4142 }
4143 
4144 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4145                                          const ArgDescriptor *Arg,
4146                                          const TargetRegisterClass *ArgRC,
4147                                          LLT ArgTy) const {
4148   MCRegister SrcReg = Arg->getRegister();
4149   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4150   assert(DstReg.isVirtual() && "Virtual register expected");
4151 
4152   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4153                                              *ArgRC, B.getDebugLoc(), ArgTy);
4154   if (Arg->isMasked()) {
4155     // TODO: Should we try to emit this once in the entry block?
4156     const LLT S32 = LLT::scalar(32);
4157     const unsigned Mask = Arg->getMask();
4158     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4159 
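    // e.g. if the ID is packed at bits 10..19 (Mask = 0xffc00), the code below
    // shifts right by 10 and then masks with 0x3ff.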
4160     Register AndMaskSrc = LiveIn;
4161 
4162     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4163     // 0.
4164     if (Shift != 0) {
4165       auto ShiftAmt = B.buildConstant(S32, Shift);
4166       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4167     }
4168 
4169     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4170   } else {
4171     B.buildCopy(DstReg, LiveIn);
4172   }
4173 
4174   return true;
4175 }
4176 
4177 bool AMDGPULegalizerInfo::loadInputValue(
4178     Register DstReg, MachineIRBuilder &B,
4179     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4180   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4181   const ArgDescriptor *Arg;
4182   const TargetRegisterClass *ArgRC;
4183   LLT ArgTy;
4184   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4185 
4186   if (!Arg) {
4187     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4188       // The intrinsic may appear when we have a zero-sized kernarg segment, in which
4189       // case the pointer argument may be missing and we use null.
4190       B.buildConstant(DstReg, 0);
4191       return true;
4192     }
4193 
4194     // It's undefined behavior if a function marked with the amdgpu-no-*
4195     // attributes uses the corresponding intrinsic.
4196     B.buildUndef(DstReg);
4197     return true;
4198   }
4199 
4200   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4201     return false; // TODO: Handle these
4202   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4203 }
4204 
4205 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4206     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4207     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4208   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4209     return false;
4210 
4211   MI.eraseFromParent();
4212   return true;
4213 }
4214 
4215 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4216                                 int64_t C) {
4217   B.buildConstant(MI.getOperand(0).getReg(), C);
4218   MI.eraseFromParent();
4219   return true;
4220 }
4221 
4222 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4223     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4224     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4225   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4226   if (MaxID == 0)
4227     return replaceWithConstant(B, MI, 0);
4228 
4229   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4230   const ArgDescriptor *Arg;
4231   const TargetRegisterClass *ArgRC;
4232   LLT ArgTy;
4233   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4234 
4235   Register DstReg = MI.getOperand(0).getReg();
4236   if (!Arg) {
4237     // It's undefined behavior if a function marked with the amdgpu-no-*
4238     // attributes uses the corresponding intrinsic.
4239     B.buildUndef(DstReg);
4240     MI.eraseFromParent();
4241     return true;
4242   }
4243 
4244   if (Arg->isMasked()) {
4245     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4246     // masking operations anyway.
4247     //
4248     // TODO: We could assert the top bit is 0 for the source copy.
4249     if (!loadInputValue(DstReg, B, ArgType))
4250       return false;
4251   } else {
4252     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4253     if (!loadInputValue(TmpReg, B, ArgType))
4254       return false;
4255     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4256   }
4257 
4258   MI.eraseFromParent();
4259   return true;
4260 }
4261 
4262 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4263                                                      int64_t Offset) const {
4264   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4265   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4266 
4267   // TODO: If we passed in the base kernel offset we could have a better
4268   // alignment than 4, but we don't really need it.
4269   if (!loadInputValue(KernArgReg, B,
4270                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4271     llvm_unreachable("failed to find kernarg segment ptr");
4272 
4273   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4274   // TODO: Should get nuw
4275   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4276 }
4277 
4278 /// Legalize a value that's loaded from kernel arguments. This is only used by
4279 /// legacy intrinsics.
4280 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4281                                                       MachineIRBuilder &B,
4282                                                       uint64_t Offset,
4283                                                       Align Alignment) const {
4284   Register DstReg = MI.getOperand(0).getReg();
4285 
4286   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4287          "unexpected kernarg parameter type");
4288 
4289   Register Ptr = getKernargParameterPtr(B, Offset);
4290   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4291   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4292               MachineMemOperand::MODereferenceable |
4293                   MachineMemOperand::MOInvariant);
4294   MI.eraseFromParent();
4295   return true;
4296 }
4297 
4298 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4299                                        MachineRegisterInfo &MRI,
4300                                        MachineIRBuilder &B) const {
4301   Register Dst = MI.getOperand(0).getReg();
4302   LLT DstTy = MRI.getType(Dst);
4303   LLT S16 = LLT::scalar(16);
4304   LLT S32 = LLT::scalar(32);
4305   LLT S64 = LLT::scalar(64);
4306 
4307   if (DstTy == S16)
4308     return legalizeFDIV16(MI, MRI, B);
4309   if (DstTy == S32)
4310     return legalizeFDIV32(MI, MRI, B);
4311   if (DstTy == S64)
4312     return legalizeFDIV64(MI, MRI, B);
4313 
4314   return false;
4315 }
4316 
4317 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4318                                                         Register DstDivReg,
4319                                                         Register DstRemReg,
4320                                                         Register X,
4321                                                         Register Y) const {
4322   const LLT S1 = LLT::scalar(1);
4323   const LLT S32 = LLT::scalar(32);
4324 
4325   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4326   // algorithm used here.
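  //
  // A rough sketch of what the sequence below computes (not the full
  // derivation; see the reference above):
  //   Z ~= floor(2^32 / Y)     via rcp(float(Y)) scaled by 0x4f7ffffe,
  //                            which is just below 2^32
  //   Z += umulh(Z, Z * -Y)    one Newton-Raphson refinement step
  //   Q  = umulh(X, Z), R = X - Q * Y
  // followed by up to two conditional corrections of Q by +1 and R by -Y.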
4327 
4328   // Initial estimate of inv(y).
4329   auto FloatY = B.buildUITOFP(S32, Y);
4330   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4331   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4332   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4333   auto Z = B.buildFPTOUI(S32, ScaledY);
4334 
4335   // One round of UNR.
4336   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4337   auto NegYZ = B.buildMul(S32, NegY, Z);
4338   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4339 
4340   // Quotient/remainder estimate.
4341   auto Q = B.buildUMulH(S32, X, Z);
4342   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4343 
4344   // First quotient/remainder refinement.
4345   auto One = B.buildConstant(S32, 1);
4346   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4347   if (DstDivReg)
4348     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4349   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4350 
4351   // Second quotient/remainder refinement.
4352   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4353   if (DstDivReg)
4354     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4355 
4356   if (DstRemReg)
4357     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4358 }
4359 
4360 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4361 //
4362 // Return lo, hi of result
4363 //
4364 // %cvt.lo = G_UITOFP Val.lo
4365 // %cvt.hi = G_UITOFP Val.hi
4366 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4367 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4368 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4369 // %mul2 = G_FMUL %mul1, 2**(-32)
4370 // %trunc = G_INTRINSIC_TRUNC %mul2
4371 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4372 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
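//
// A note on the constants (a sketch): 0x4f800000 is 2^32, 0x2f800000 is 2^-32,
// 0x5f7ffffc is just below 2^64, and 0xcf800000 is -(2^32) as f32 bit
// patterns, so the returned {lo, hi} pair roughly approximates 2^64 / Val.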
4373 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4374                                                        Register Val) {
4375   const LLT S32 = LLT::scalar(32);
4376   auto Unmerge = B.buildUnmerge(S32, Val);
4377 
4378   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4379   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4380 
4381   auto Mad = B.buildFMAD(
4382       S32, CvtHi, // 2**32
4383       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4384 
4385   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4386   auto Mul1 = B.buildFMul(
4387       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4388 
4389   // 2**(-32)
4390   auto Mul2 = B.buildFMul(
4391       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4392   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4393 
4394   // -(2**32)
4395   auto Mad2 = B.buildFMAD(
4396       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4397       Mul1);
4398 
4399   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4400   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4401 
4402   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4403 }
4404 
4405 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4406                                                         Register DstDivReg,
4407                                                         Register DstRemReg,
4408                                                         Register Numer,
4409                                                         Register Denom) const {
4410   const LLT S32 = LLT::scalar(32);
4411   const LLT S64 = LLT::scalar(64);
4412   const LLT S1 = LLT::scalar(1);
4413   Register RcpLo, RcpHi;
4414 
4415   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4416 
4417   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4418 
4419   auto Zero64 = B.buildConstant(S64, 0);
4420   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4421 
4422   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4423   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4424 
4425   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4426   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4427   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4428 
4429   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4430   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4431   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4432 
4433   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4434   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4435   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4436   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4437   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4438 
4439   auto Zero32 = B.buildConstant(S32, 0);
4440   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4441   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4442   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4443 
4444   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4445   Register NumerLo = UnmergeNumer.getReg(0);
4446   Register NumerHi = UnmergeNumer.getReg(1);
4447 
4448   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4449   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4450   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4451   Register Mul3_Lo = UnmergeMul3.getReg(0);
4452   Register Mul3_Hi = UnmergeMul3.getReg(1);
4453   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4454   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4455   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4456   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4457 
4458   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4459   Register DenomLo = UnmergeDenom.getReg(0);
4460   Register DenomHi = UnmergeDenom.getReg(1);
4461 
4462   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4463   auto C1 = B.buildSExt(S32, CmpHi);
4464 
4465   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4466   auto C2 = B.buildSExt(S32, CmpLo);
4467 
4468   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4469   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4470 
4471   // TODO: Here and below, portions of the code could be enclosed in if/endif.
4472   // Currently the control flow is unconditional and we have 4 selects after
4473   // the potential endif to substitute for PHIs.
4474 
4475   // if C3 != 0 ...
4476   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4477   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4478   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4479   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4480 
4481   auto One64 = B.buildConstant(S64, 1);
4482   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4483 
4484   auto C4 =
4485       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4486   auto C5 =
4487       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4488   auto C6 = B.buildSelect(
4489       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4490 
4491   // if (C6 != 0)
4492   auto Add4 = B.buildAdd(S64, Add3, One64);
4493   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4494 
4495   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4496   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4497   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4498 
4499   // endif C6
4500   // endif C3
4501 
4502   if (DstDivReg) {
4503     auto Sel1 = B.buildSelect(
4504         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4505     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4506                   Sel1, MulHi3);
4507   }
4508 
4509   if (DstRemReg) {
4510     auto Sel2 = B.buildSelect(
4511         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4512     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4513                   Sel2, Sub1);
4514   }
4515 }
4516 
4517 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4518                                                   MachineRegisterInfo &MRI,
4519                                                   MachineIRBuilder &B) const {
4520   Register DstDivReg, DstRemReg;
4521   switch (MI.getOpcode()) {
4522   default:
4523     llvm_unreachable("Unexpected opcode!");
4524   case AMDGPU::G_UDIV: {
4525     DstDivReg = MI.getOperand(0).getReg();
4526     break;
4527   }
4528   case AMDGPU::G_UREM: {
4529     DstRemReg = MI.getOperand(0).getReg();
4530     break;
4531   }
4532   case AMDGPU::G_UDIVREM: {
4533     DstDivReg = MI.getOperand(0).getReg();
4534     DstRemReg = MI.getOperand(1).getReg();
4535     break;
4536   }
4537   }
4538 
4539   const LLT S64 = LLT::scalar(64);
4540   const LLT S32 = LLT::scalar(32);
4541   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4542   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4543   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4544   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4545 
4546   if (Ty == S32)
4547     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4548   else if (Ty == S64)
4549     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4550   else
4551     return false;
4552 
4553   MI.eraseFromParent();
4554   return true;
4555 }
4556 
4557 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4558                                                 MachineRegisterInfo &MRI,
4559                                                 MachineIRBuilder &B) const {
4560   const LLT S64 = LLT::scalar(64);
4561   const LLT S32 = LLT::scalar(32);
4562 
4563   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4564   if (Ty != S32 && Ty != S64)
4565     return false;
4566 
4567   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4568   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4569   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4570 
4571   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4572   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4573   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4574 
4575   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4576   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4577 
4578   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4579   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4580 
4581   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4582   switch (MI.getOpcode()) {
4583   default:
4584     llvm_unreachable("Unexpected opcode!");
4585   case AMDGPU::G_SDIV: {
4586     DstDivReg = MI.getOperand(0).getReg();
4587     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4588     break;
4589   }
4590   case AMDGPU::G_SREM: {
4591     DstRemReg = MI.getOperand(0).getReg();
4592     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4593     break;
4594   }
4595   case AMDGPU::G_SDIVREM: {
4596     DstDivReg = MI.getOperand(0).getReg();
4597     DstRemReg = MI.getOperand(1).getReg();
4598     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4599     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4600     break;
4601   }
4602   }
4603 
4604   if (Ty == S32)
4605     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4606   else
4607     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4608 
4609   if (DstDivReg) {
4610     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4611     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4612     B.buildSub(DstDivReg, SignXor, Sign);
4613   }
4614 
4615   if (DstRemReg) {
4616     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4617     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4618     B.buildSub(DstRemReg, SignXor, Sign);
4619   }
4620 
4621   MI.eraseFromParent();
4622   return true;
4623 }
4624 
4625 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4626                                                  MachineRegisterInfo &MRI,
4627                                                  MachineIRBuilder &B) const {
4628   Register Res = MI.getOperand(0).getReg();
4629   Register LHS = MI.getOperand(1).getReg();
4630   Register RHS = MI.getOperand(2).getReg();
4631   uint16_t Flags = MI.getFlags();
4632   LLT ResTy = MRI.getType(Res);
4633 
4634   const MachineFunction &MF = B.getMF();
4635   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4636                             MF.getTarget().Options.UnsafeFPMath;
4637 
4638   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4639     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4640       return false;
4641 
4642     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4643     // the CI documentation have a worst case error of 1 ulp.
4644     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4645     // use it as long as we aren't trying to use denormals.
4646     //
4647     // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4648 
4649     // 1 / x -> RCP(x)
4650     if (CLHS->isExactlyValue(1.0)) {
4651       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4652           .addUse(RHS)
4653           .setMIFlags(Flags);
4654 
4655       MI.eraseFromParent();
4656       return true;
4657     }
4658 
4659     // -1 / x -> RCP( FNEG(x) )
4660     if (CLHS->isExactlyValue(-1.0)) {
4661       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4662       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4663           .addUse(FNeg.getReg(0))
4664           .setMIFlags(Flags);
4665 
4666       MI.eraseFromParent();
4667       return true;
4668     }
4669   }
4670 
4671   // For f16 require afn or arcp.
4672   // For f32 require afn.
4673   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4674                               !MI.getFlag(MachineInstr::FmArcp)))
4675     return false;
4676 
4677   // x / y -> x * (1.0 / y)
4678   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4679                  .addUse(RHS)
4680                  .setMIFlags(Flags);
4681   B.buildFMul(Res, LHS, RCP, Flags);
4682 
4683   MI.eraseFromParent();
4684   return true;
4685 }
4686 
4687 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4688                                                    MachineRegisterInfo &MRI,
4689                                                    MachineIRBuilder &B) const {
4690   Register Res = MI.getOperand(0).getReg();
4691   Register X = MI.getOperand(1).getReg();
4692   Register Y = MI.getOperand(2).getReg();
4693   uint16_t Flags = MI.getFlags();
4694   LLT ResTy = MRI.getType(Res);
4695 
4696   const MachineFunction &MF = B.getMF();
4697   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4698                             MI.getFlag(MachineInstr::FmAfn);
4699 
4700   if (!AllowInaccurateRcp)
4701     return false;
4702 
4703   auto NegY = B.buildFNeg(ResTy, Y);
4704   auto One = B.buildFConstant(ResTy, 1.0);
4705 
4706   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4707                .addUse(Y)
4708                .setMIFlags(Flags);
4709 
4710   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4711   R = B.buildFMA(ResTy, Tmp0, R, R);
4712 
4713   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4714   R = B.buildFMA(ResTy, Tmp1, R, R);
4715 
4716   auto Ret = B.buildFMul(ResTy, X, R);
4717   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4718 
4719   B.buildFMA(Res, Tmp2, R, Ret);
4720   MI.eraseFromParent();
4721   return true;
4722 }
4723 
4724 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4725                                          MachineRegisterInfo &MRI,
4726                                          MachineIRBuilder &B) const {
4727   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4728     return true;
4729 
4730   Register Res = MI.getOperand(0).getReg();
4731   Register LHS = MI.getOperand(1).getReg();
4732   Register RHS = MI.getOperand(2).getReg();
4733 
4734   uint16_t Flags = MI.getFlags();
4735 
4736   LLT S16 = LLT::scalar(16);
4737   LLT S32 = LLT::scalar(32);
4738 
4739   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4740   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4741 
4742   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4743                  .addUse(RHSExt.getReg(0))
4744                  .setMIFlags(Flags);
4745 
4746   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4747   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4748 
4749   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4750       .addUse(RDst.getReg(0))
4751       .addUse(RHS)
4752       .addUse(LHS)
4753       .setMIFlags(Flags);
4754 
4755   MI.eraseFromParent();
4756   return true;
4757 }
4758 
4759 static const unsigned SPDenormModeBitField =
4760     AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4761     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
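// A sketch of what this encodes, assuming the usual MODE register layout: a
// 2-bit field (width-1 == 1) at bit offset 4 of HW_REG_MODE, i.e. the FP32
// denorm-mode bits that sit just below the FP64/FP16 denorm-mode bits.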
4762 
4763 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4764 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4765 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4766                                const GCNSubtarget &ST,
4767                                SIModeRegisterDefaults Mode) {
4768   // Set SP denorm mode to this value.
4769   unsigned SPDenormMode =
4770     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4771 
4772   if (ST.hasDenormModeInst()) {
4773     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4774     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4775 
4776     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4777     B.buildInstr(AMDGPU::S_DENORM_MODE)
4778       .addImm(NewDenormModeValue);
4779 
4780   } else {
4781     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4782       .addImm(SPDenormMode)
4783       .addImm(SPDenormModeBitField);
4784   }
4785 }
4786 
4787 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4788                                          MachineRegisterInfo &MRI,
4789                                          MachineIRBuilder &B) const {
4790   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4791     return true;
4792 
4793   Register Res = MI.getOperand(0).getReg();
4794   Register LHS = MI.getOperand(1).getReg();
4795   Register RHS = MI.getOperand(2).getReg();
4796   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4797   SIModeRegisterDefaults Mode = MFI->getMode();
4798 
4799   uint16_t Flags = MI.getFlags();
4800 
4801   LLT S32 = LLT::scalar(32);
4802   LLT S1 = LLT::scalar(1);
4803 
4804   auto One = B.buildFConstant(S32, 1.0f);
4805 
4806   auto DenominatorScaled =
4807       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4808           .addUse(LHS)
4809           .addUse(RHS)
4810           .addImm(0)
4811           .setMIFlags(Flags);
4812   auto NumeratorScaled =
4813       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4814           .addUse(LHS)
4815           .addUse(RHS)
4816           .addImm(1)
4817           .setMIFlags(Flags);
4818 
4819   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4820                        .addUse(DenominatorScaled.getReg(0))
4821                        .setMIFlags(Flags);
4822   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4823 
4824   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4825   const bool HasDynamicDenormals =
4826       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4827       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4828 
4829   Register SavedSPDenormMode;
4830   if (!PreservesDenormals) {
4831     if (HasDynamicDenormals) {
4832       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4833       B.buildInstr(AMDGPU::S_GETREG_B32)
4834           .addDef(SavedSPDenormMode)
4835           .addImm(SPDenormModeBitField);
4836     }
4837     toggleSPDenormMode(true, B, ST, Mode);
4838   }
4839 
4840   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4841   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4842   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4843   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4844   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4845   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4846 
4847   if (!PreservesDenormals) {
4848     if (HasDynamicDenormals) {
4849       assert(SavedSPDenormMode);
4850       B.buildInstr(AMDGPU::S_SETREG_B32)
4851           .addReg(SavedSPDenormMode)
4852           .addImm(SPDenormModeBitField);
4853     } else
4854       toggleSPDenormMode(false, B, ST, Mode);
4855   }
4856 
4857   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4858                   .addUse(Fma4.getReg(0))
4859                   .addUse(Fma1.getReg(0))
4860                   .addUse(Fma3.getReg(0))
4861                   .addUse(NumeratorScaled.getReg(1))
4862                   .setMIFlags(Flags);
4863 
4864   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4865       .addUse(Fmas.getReg(0))
4866       .addUse(RHS)
4867       .addUse(LHS)
4868       .setMIFlags(Flags);
4869 
4870   MI.eraseFromParent();
4871   return true;
4872 }
4873 
4874 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4875                                          MachineRegisterInfo &MRI,
4876                                          MachineIRBuilder &B) const {
4877   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4878     return true;
4879 
4880   Register Res = MI.getOperand(0).getReg();
4881   Register LHS = MI.getOperand(1).getReg();
4882   Register RHS = MI.getOperand(2).getReg();
4883 
4884   uint16_t Flags = MI.getFlags();
4885 
4886   LLT S64 = LLT::scalar(64);
4887   LLT S1 = LLT::scalar(1);
4888 
4889   auto One = B.buildFConstant(S64, 1.0);
4890 
4891   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4892                        .addUse(LHS)
4893                        .addUse(RHS)
4894                        .addImm(0)
4895                        .setMIFlags(Flags);
4896 
4897   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4898 
4899   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4900                  .addUse(DivScale0.getReg(0))
4901                  .setMIFlags(Flags);
4902 
4903   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4904   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4905   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4906 
4907   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4908                        .addUse(LHS)
4909                        .addUse(RHS)
4910                        .addImm(1)
4911                        .setMIFlags(Flags);
4912 
4913   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4914   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4915   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4916 
4917   Register Scale;
4918   if (!ST.hasUsableDivScaleConditionOutput()) {
4919     // Workaround a hardware bug on SI where the condition output from div_scale
4920     // is not usable.
4921 
4922     LLT S32 = LLT::scalar(32);
4923 
4924     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4925     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4926     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4927     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4928 
4929     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4930                               Scale1Unmerge.getReg(1));
4931     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4932                               Scale0Unmerge.getReg(1));
4933     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4934   } else {
4935     Scale = DivScale1.getReg(1);
4936   }
4937 
4938   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
4939                   .addUse(Fma4.getReg(0))
4940                   .addUse(Fma3.getReg(0))
4941                   .addUse(Mul.getReg(0))
4942                   .addUse(Scale)
4943                   .setMIFlags(Flags);
4944 
4945   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
4946       .addUse(Fmas.getReg(0))
4947       .addUse(RHS)
4948       .addUse(LHS)
4949       .setMIFlags(Flags);
4950 
4951   MI.eraseFromParent();
4952   return true;
4953 }
4954 
4955 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
4956                                          MachineRegisterInfo &MRI,
4957                                          MachineIRBuilder &B) const {
4958   Register Res0 = MI.getOperand(0).getReg();
4959   Register Res1 = MI.getOperand(1).getReg();
4960   Register Val = MI.getOperand(2).getReg();
4961   uint16_t Flags = MI.getFlags();
4962 
4963   LLT Ty = MRI.getType(Res0);
4964   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
4965 
4966   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
4967                   .addUse(Val)
4968                   .setMIFlags(Flags);
4969   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
4970                  .addUse(Val)
4971                  .setMIFlags(Flags);
4972 
4973   if (ST.hasFractBug()) {
4974     auto Fabs = B.buildFAbs(Ty, Val);
4975     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
4976     auto IsFinite =
4977         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
4978     auto Zero = B.buildConstant(InstrExpTy, 0);
4979     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
4980     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
4981   }
4982 
4983   B.buildCopy(Res0, Mant);
4984   B.buildSExtOrTrunc(Res1, Exp);
4985 
4986   MI.eraseFromParent();
4987   return true;
4988 }
4989 
4990 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4991                                                  MachineRegisterInfo &MRI,
4992                                                  MachineIRBuilder &B) const {
4993   Register Res = MI.getOperand(0).getReg();
4994   Register LHS = MI.getOperand(2).getReg();
4995   Register RHS = MI.getOperand(3).getReg();
4996   uint16_t Flags = MI.getFlags();
4997 
4998   LLT S32 = LLT::scalar(32);
4999   LLT S1 = LLT::scalar(1);
5000 
5001   auto Abs = B.buildFAbs(S32, RHS, Flags);
5002   const APFloat C0Val(1.0f);
5003 
5004   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5005   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5006   auto C2 = B.buildFConstant(S32, 1.0f);
5007 
5008   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5009   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5010 
5011   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5012 
5013   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5014                  .addUse(Mul0.getReg(0))
5015                  .setMIFlags(Flags);
5016 
5017   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5018 
5019   B.buildFMul(Res, Sel, Mul1, Flags);
5020 
5021   MI.eraseFromParent();
5022   return true;
5023 }
5024 
5025 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5026                                            MachineRegisterInfo &MRI,
5027                                            MachineIRBuilder &B) const {
5028   // Bypass the correct expansion that a standard promotion through G_FSQRT
5029   // would get. The f32 op is accurate enough for the f16 case.
5030   unsigned Flags = MI.getFlags();
5031   assert(!ST.has16BitInsts());
5032   const LLT F32 = LLT::scalar(32);
5033   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5034   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5035     .addUse(Ext.getReg(0))
5036     .setMIFlags(Flags);
5037   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5038   MI.eraseFromParent();
5039   return true;
5040 }
5041 
5042 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5043                                            MachineRegisterInfo &MRI,
5044                                            MachineIRBuilder &B) const {
5045   MachineFunction &MF = B.getMF();
5046   Register Dst = MI.getOperand(0).getReg();
5047   Register X = MI.getOperand(1).getReg();
5048   const unsigned Flags = MI.getFlags();
5049   const LLT S1 = LLT::scalar(1);
5050   const LLT F32 = LLT::scalar(32);
5051   const LLT I32 = LLT::scalar(32);
5052 
5053   if (allowApproxFunc(MF, Flags)) {
5054     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5055       .addUse(X)
5056       .setMIFlags(Flags);
5057     MI.eraseFromParent();
5058     return true;
5059   }
5060 
5061   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5062   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5063   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5064   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5065   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
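  // A sketch of the scaling scheme: inputs below 2^-96 are multiplied by 2^32
  // before the sqrt, and since sqrt(x * 2^32) == sqrt(x) * 2^16, the result is
  // rescaled by 2^-16 at the end (see ScaleDownFactor below).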
5066 
5067   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5068   if (needsDenormHandlingF32(MF, X, Flags)) {
5069     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5070       .addUse(SqrtX.getReg(0))
5071       .setMIFlags(Flags);
5072 
5073     auto NegOne = B.buildConstant(I32, -1);
5074     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5075 
5076     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5077     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5078 
5079     auto PosOne = B.buildConstant(I32, 1);
5080     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5081 
5082     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5083     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5084 
5085     auto Zero = B.buildFConstant(F32, 0.0f);
5086     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5087 
5088     SqrtS =
5089         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5090 
5091     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5092     SqrtS =
5093         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
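    // (A sketch of the idea: for a positive normal f32, adding or subtracting
    // 1 from the integer bit pattern yields the next representable value up or
    // down, and the two FMA residuals above decide whether the estimate should
    // be nudged one ulp down or up.)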
5094   } else {
5095     auto SqrtR =
5096         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5097     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5098 
5099     auto Half = B.buildFConstant(F32, 0.5f);
5100     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5101     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5102     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5103     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5104     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5105     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5106     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5107     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5108   }
5109 
5110   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5111 
5112   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5113 
5114   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5115 
5116   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5117   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5118 
5119   MI.eraseFromParent();
5120   return true;
5121 }
5122 
5123 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5124                                            MachineRegisterInfo &MRI,
5125                                            MachineIRBuilder &B) const {
5126   // For double type, the SQRT and RSQ instructions don't have required
5127   // precision, we apply Goldschmidt's algorithm to improve the result:
5128   //
5129   //   y0 = rsq(x)
5130   //   g0 = x * y0
5131   //   h0 = 0.5 * y0
5132   //
5133   //   r0 = 0.5 - h0 * g0
5134   //   g1 = g0 * r0 + g0
5135   //   h1 = h0 * r0 + h0
5136   //
5137   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5138   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
5139   //   h2 = h1 * r1 + h1
5140   //
5141   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5142   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
5143   //
5144   //   sqrt(x) = g3
5145 
5146   const LLT S1 = LLT::scalar(1);
5147   const LLT S32 = LLT::scalar(32);
5148   const LLT F64 = LLT::scalar(64);
5149 
5150   Register Dst = MI.getOperand(0).getReg();
5151   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5152 
5153   Register X = MI.getOperand(1).getReg();
5154   unsigned Flags = MI.getFlags();
5155 
5156   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5157 
5158   auto ZeroInt = B.buildConstant(S32, 0);
5159   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5160 
5161   // Scale up input if it is too small.
5162   auto ScaleUpFactor = B.buildConstant(S32, 256);
5163   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5164   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
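  // A sketch of the scaling: small inputs are multiplied by 2^256 (an even
  // power, so sqrt(x * 2^256) == sqrt(x) * 2^128), and the final result is
  // scaled back down by ldexp(..., -128) below.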
5165 
5166   auto SqrtY =
5167       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5168 
5169   auto Half = B.buildFConstant(F64, 0.5);
5170   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5171   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5172 
5173   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5174   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5175 
5176   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5177   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5178 
5179   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5180   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5181 
5182   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5183 
5184   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5185   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5186 
5187   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5188 
5189   // Scale down the result.
5190   auto ScaleDownFactor = B.buildConstant(S32, -128);
5191   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5192   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5193 
5194   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5195   // with finite only or nsz because rsq(+/-0) = +/-inf
5196 
5197   // TODO: Check for DAZ and expand to subnormals
5198   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5199 
5200   // If x is +INF, +0, or -0, use its original value
5201   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5202 
5203   MI.eraseFromParent();
5204   return true;
5205 }
5206 
5207 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5208                                         MachineRegisterInfo &MRI,
5209                                         MachineIRBuilder &B) const {
5210   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5211   if (Ty == LLT::scalar(32))
5212     return legalizeFSQRTF32(MI, MRI, B);
5213   if (Ty == LLT::scalar(64))
5214     return legalizeFSQRTF64(MI, MRI, B);
5215   if (Ty == LLT::scalar(16))
5216     return legalizeFSQRTF16(MI, MRI, B);
5217   return false;
5218 }
5219 
5220 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5221 // FIXME: Why do we handle this one but not other removed instructions?
5222 //
5223 // Reciprocal square root.  The clamp prevents infinite results, clamping
5224 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5225 // +-max_float.
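//
// A sketch of the expansion emitted below (names as in the code): Rsq = rsq(x),
// ClampMax = fminnum[_ieee](Rsq, +max_float), and the final result is
// fmaxnum[_ieee](ClampMax, -max_float), choosing the IEEE or non-IEEE variants
// based on the function's FP mode.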
5226 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5227                                                     MachineRegisterInfo &MRI,
5228                                                     MachineIRBuilder &B) const {
5229   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5230     return true;
5231 
5232   Register Dst = MI.getOperand(0).getReg();
5233   Register Src = MI.getOperand(2).getReg();
5234   auto Flags = MI.getFlags();
5235 
5236   LLT Ty = MRI.getType(Dst);
5237 
5238   const fltSemantics *FltSemantics;
5239   if (Ty == LLT::scalar(32))
5240     FltSemantics = &APFloat::IEEEsingle();
5241   else if (Ty == LLT::scalar(64))
5242     FltSemantics = &APFloat::IEEEdouble();
5243   else
5244     return false;
5245 
5246   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5247                  .addUse(Src)
5248                  .setMIFlags(Flags);
5249 
5250   // We don't need to concern ourselves with the snan handling difference,
5251   // since the rsq already quieted (or not); use the one that selects directly.
5252   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5253   const bool UseIEEE = MFI->getMode().IEEE;
5254 
5255   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5256   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5257                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5258 
5259   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5260 
5261   if (UseIEEE)
5262     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5263   else
5264     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5265   MI.eraseFromParent();
5266   return true;
5267 }
5268 
5269 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5270   switch (IID) {
5271   case Intrinsic::amdgcn_ds_fadd:
5272     return AMDGPU::G_ATOMICRMW_FADD;
5273   case Intrinsic::amdgcn_ds_fmin:
5274     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5275   case Intrinsic::amdgcn_ds_fmax:
5276     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5277   default:
5278     llvm_unreachable("not a DS FP intrinsic");
5279   }
5280 }
5281 
5282 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5283                                                       MachineInstr &MI,
5284                                                       Intrinsic::ID IID) const {
5285   GISelChangeObserver &Observer = Helper.Observer;
5286   Observer.changingInstr(MI);
5287 
5288   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5289 
5290   // The remaining operands were used to set fields in the MemOperand on
5291   // construction.
5292   for (int I = 6; I > 3; --I)
5293     MI.removeOperand(I);
5294 
5295   MI.removeOperand(1); // Remove the intrinsic ID.
5296   Observer.changedInstr(MI);
5297   return true;
5298 }
5299 
5300 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5301                                             MachineRegisterInfo &MRI,
5302                                             MachineIRBuilder &B) const {
5303   uint64_t Offset =
5304     ST.getTargetLowering()->getImplicitParameterOffset(
5305       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5306   LLT DstTy = MRI.getType(DstReg);
5307   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5308 
5309   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5310   if (!loadInputValue(KernargPtrReg, B,
5311                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5312     return false;
5313 
5314   // FIXME: This should be nuw
5315   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5316   return true;
5317 }
5318 
5319 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5320 /// bits of the pointer and replace them with the stride argument, then
5321 /// merge_values everything together. In the common case of a raw buffer (the
5322 /// stride component is 0), we can just AND off the upper half.
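///
/// Roughly, the resulting <4 x s32> words are (a sketch based on the code
/// below): word0 = ptr[31:0], word1 = ptr[47:32] | (stride << 16),
/// word2 = NumRecords, word3 = Flags.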
5323 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5324     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5325   Register Result = MI.getOperand(0).getReg();
5326   Register Pointer = MI.getOperand(2).getReg();
5327   Register Stride = MI.getOperand(3).getReg();
5328   Register NumRecords = MI.getOperand(4).getReg();
5329   Register Flags = MI.getOperand(5).getReg();
5330 
5331   LLT S32 = LLT::scalar(32);
5332 
5333   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5334   auto Unmerge = B.buildUnmerge(S32, Pointer);
5335   Register LowHalf = Unmerge.getReg(0);
5336   Register HighHalf = Unmerge.getReg(1);
5337 
5338   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5339   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5340 
5341   MachineInstrBuilder NewHighHalf = Masked;
5342   std::optional<ValueAndVReg> StrideConst =
5343       getIConstantVRegValWithLookThrough(Stride, MRI);
5344   if (!StrideConst || !StrideConst->Value.isZero()) {
5345     MachineInstrBuilder ShiftedStride;
5346     if (StrideConst) {
5347       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5348       uint32_t ShiftedStrideVal = StrideVal << 16;
5349       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5350     } else {
5351       auto ExtStride = B.buildAnyExt(S32, Stride);
5352       auto ShiftConst = B.buildConstant(S32, 16);
5353       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5354     }
5355     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5356   }
5357   Register NewHighHalfReg = NewHighHalf.getReg(0);
5358   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5359   MI.eraseFromParent();
5360   return true;
5361 }
5362 
5363 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5364                                                  MachineRegisterInfo &MRI,
5365                                                  MachineIRBuilder &B) const {
5366   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5367   if (!MFI->isEntryFunction()) {
5368     return legalizePreloadedArgIntrin(MI, MRI, B,
5369                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5370   }
5371 
5372   Register DstReg = MI.getOperand(0).getReg();
5373   if (!getImplicitArgPtr(DstReg, MRI, B))
5374     return false;
5375 
5376   MI.eraseFromParent();
5377   return true;
5378 }
5379 
5380 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5381                                          MachineRegisterInfo &MRI,
5382                                          MachineIRBuilder &B) const {
5383   Function &F = B.getMF().getFunction();
5384   std::optional<uint32_t> KnownSize =
5385       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5386   if (KnownSize.has_value())
5387     B.buildConstant(DstReg, *KnownSize);
5388   return false;
5389 }
5390 
5391 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5392                                               MachineRegisterInfo &MRI,
5393                                               MachineIRBuilder &B) const {
5394 
5395   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5396   if (!MFI->isEntryFunction()) {
5397     return legalizePreloadedArgIntrin(MI, MRI, B,
5398                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5399   }
5400 
5401   Register DstReg = MI.getOperand(0).getReg();
5402   if (!getLDSKernelId(DstReg, MRI, B))
5403     return false;
5404 
5405   MI.eraseFromParent();
5406   return true;
5407 }
5408 
5409 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5410                                               MachineRegisterInfo &MRI,
5411                                               MachineIRBuilder &B,
5412                                               unsigned AddrSpace) const {
5413   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5414   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5415   Register Hi32 = Unmerge.getReg(1);
5416 
5417   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5418   MI.eraseFromParent();
5419   return true;
5420 }
5421 
5422 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5423 // offset (the offset that is included in bounds checking and swizzling, to be
5424 // split between the instruction's voffset and immoffset fields) and soffset
5425 // (the offset that is excluded from bounds checking and swizzling, to go in
5426 // the instruction's soffset field).  This function takes the first kind of
5427 // offset and figures out how to split it between voffset and immoffset.
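//
// A worked example (a sketch, assuming a maximum MUBUF immediate offset of
// 4095): a constant offset of 5000 is split so that 4096 goes into the voffset
// register and 904 stays in the instruction's immediate offset field.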
5428 std::pair<Register, unsigned>
5429 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5430                                         Register OrigOffset) const {
5431   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5432   Register BaseReg;
5433   unsigned ImmOffset;
5434   const LLT S32 = LLT::scalar(32);
5435   MachineRegisterInfo &MRI = *B.getMRI();
5436 
5437   std::tie(BaseReg, ImmOffset) =
5438       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5439 
5440   // If BaseReg is a pointer, convert it to int.
5441   if (MRI.getType(BaseReg).isPointer())
5442     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5443 
5444   // If the immediate value is too big for the immoffset field, put only bits
5445   // that would normally fit in the immoffset field. The remaining value that
5446   // is copied/added for the voffset field is a large power of 2, and it
5447   // stands more chance of being CSEd with the copy/add for another similar
5448   // load/store.
5449   // However, do not do that rounding down if that is a negative
5450   // number, as it appears to be illegal to have a negative offset in the
5451   // vgpr, even if adding the immediate offset makes it positive.
5452   unsigned Overflow = ImmOffset & ~MaxImm;
5453   ImmOffset -= Overflow;
5454   if ((int32_t)Overflow < 0) {
5455     Overflow += ImmOffset;
5456     ImmOffset = 0;
5457   }
5458 
5459   if (Overflow != 0) {
5460     if (!BaseReg) {
5461       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5462     } else {
5463       auto OverflowVal = B.buildConstant(S32, Overflow);
5464       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5465     }
5466   }
5467 
5468   if (!BaseReg)
5469     BaseReg = B.buildConstant(S32, 0).getReg(0);
5470 
5471   return std::pair(BaseReg, ImmOffset);
5472 }
5473 
5474 /// Handle register layout difference for f16 images for some subtargets.
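/// For example (a sketch of the paths below): on subtargets with unpacked D16
/// VMEM a <4 x s16> value is any-extended element-wise into a <4 x s32>, on
/// subtargets with the image-store D16 bug the data is padded out and bitcast
/// into whole 32-bit dwords, and otherwise only <3 x s16> needs padding to
/// <4 x s16>.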
5475 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5476                                              MachineRegisterInfo &MRI,
5477                                              Register Reg,
5478                                              bool ImageStore) const {
5479   const LLT S16 = LLT::scalar(16);
5480   const LLT S32 = LLT::scalar(32);
5481   LLT StoreVT = MRI.getType(Reg);
5482   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5483 
5484   if (ST.hasUnpackedD16VMem()) {
5485     auto Unmerge = B.buildUnmerge(S16, Reg);
5486 
5487     SmallVector<Register, 4> WideRegs;
5488     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5489       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5490 
5491     int NumElts = StoreVT.getNumElements();
5492 
5493     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5494         .getReg(0);
5495   }
5496 
5497   if (ImageStore && ST.hasImageStoreD16Bug()) {
5498     if (StoreVT.getNumElements() == 2) {
5499       SmallVector<Register, 4> PackedRegs;
5500       Reg = B.buildBitcast(S32, Reg).getReg(0);
5501       PackedRegs.push_back(Reg);
5502       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5503       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5504           .getReg(0);
5505     }
5506 
5507     if (StoreVT.getNumElements() == 3) {
5508       SmallVector<Register, 4> PackedRegs;
5509       auto Unmerge = B.buildUnmerge(S16, Reg);
5510       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5511         PackedRegs.push_back(Unmerge.getReg(I));
5512       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5513       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5514       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5515     }
5516 
5517     if (StoreVT.getNumElements() == 4) {
5518       SmallVector<Register, 4> PackedRegs;
5519       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5520       auto Unmerge = B.buildUnmerge(S32, Reg);
5521       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5522         PackedRegs.push_back(Unmerge.getReg(I));
5523       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5524       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5525           .getReg(0);
5526     }
5527 
5528     llvm_unreachable("invalid data type");
5529   }
5530 
5531   if (StoreVT == LLT::fixed_vector(3, S16)) {
5532     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5533               .getReg(0);
5534   }
5535   return Reg;
5536 }
5537 
5538 Register AMDGPULegalizerInfo::fixStoreSourceType(
5539   MachineIRBuilder &B, Register VData, bool IsFormat) const {
5540   MachineRegisterInfo *MRI = B.getMRI();
5541   LLT Ty = MRI->getType(VData);
5542 
5543   const LLT S16 = LLT::scalar(16);
5544 
5545   // Fixup buffer resources themselves needing to be v4i128.
5546   if (hasBufferRsrcWorkaround(Ty))
5547     return castBufferRsrcToV4I32(VData, B);
5548 
5549   // Fixup illegal register types for i8 and i16 stores.
5550   if (Ty == LLT::scalar(8) || Ty == S16) {
5551     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5552     return AnyExt;
5553   }
5554 
5555   if (Ty.isVector()) {
5556     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5557       if (IsFormat)
5558         return handleD16VData(B, *MRI, VData);
5559     }
5560   }
5561 
5562   return VData;
5563 }
5564 
5565 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5566                                               MachineRegisterInfo &MRI,
5567                                               MachineIRBuilder &B,
5568                                               bool IsTyped,
5569                                               bool IsFormat) const {
5570   Register VData = MI.getOperand(1).getReg();
5571   LLT Ty = MRI.getType(VData);
5572   LLT EltTy = Ty.getScalarType();
5573   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5574   const LLT S32 = LLT::scalar(32);
5575 
5576   VData = fixStoreSourceType(B, VData, IsFormat);
5577   castBufferRsrcArgToV4I32(MI, B, 2);
5578   Register RSrc = MI.getOperand(2).getReg();
5579 
5580   MachineMemOperand *MMO = *MI.memoperands_begin();
5581   const int MemSize = MMO->getSize();
5582 
5583   unsigned ImmOffset;
5584 
5585   // The typed intrinsics add an immediate after the registers.
5586   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5587 
5588   // The struct intrinsic variants add one additional operand over raw.
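  // Roughly, the source operand layout is (a sketch; operand 0 is the
  // intrinsic ID):
  //   raw:    vdata, rsrc,         voffset, soffset, [format,] aux
  //   struct: vdata, rsrc, vindex, voffset, soffset, [format,] aux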
5589   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5590   Register VIndex;
5591   int OpOffset = 0;
5592   if (HasVIndex) {
5593     VIndex = MI.getOperand(3).getReg();
5594     OpOffset = 1;
5595   } else {
5596     VIndex = B.buildConstant(S32, 0).getReg(0);
5597   }
5598 
5599   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5600   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5601 
5602   unsigned Format = 0;
5603   if (IsTyped) {
5604     Format = MI.getOperand(5 + OpOffset).getImm();
5605     ++OpOffset;
5606   }
5607 
5608   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5609 
5610   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5611 
5612   unsigned Opc;
5613   if (IsTyped) {
5614     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5615                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5616   } else if (IsFormat) {
5617     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5618                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5619   } else {
5620     switch (MemSize) {
5621     case 1:
5622       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5623       break;
5624     case 2:
5625       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5626       break;
5627     default:
5628       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5629       break;
5630     }
5631   }
5632 
5633   auto MIB = B.buildInstr(Opc)
5634     .addUse(VData)              // vdata
5635     .addUse(RSrc)               // rsrc
5636     .addUse(VIndex)             // vindex
5637     .addUse(VOffset)            // voffset
5638     .addUse(SOffset)            // soffset
5639     .addImm(ImmOffset);         // offset(imm)
5640 
5641   if (IsTyped)
5642     MIB.addImm(Format);
5643 
5644   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5645      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5646      .addMemOperand(MMO);
5647 
5648   MI.eraseFromParent();
5649   return true;
5650 }
5651 
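     /// Build one of the G_AMDGPU_BUFFER_LOAD* pseudos with the shared buffer
     /// operand layout. The format immediate is only present for the typed
     /// (tbuffer) variants.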
5652 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5653                             Register VIndex, Register VOffset, Register SOffset,
5654                             unsigned ImmOffset, unsigned Format,
5655                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5656                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5657   auto MIB = B.buildInstr(Opc)
5658                  .addDef(LoadDstReg) // vdata
5659                  .addUse(RSrc)       // rsrc
5660                  .addUse(VIndex)     // vindex
5661                  .addUse(VOffset)    // voffset
5662                  .addUse(SOffset)    // soffset
5663                  .addImm(ImmOffset); // offset(imm)
5664 
5665   if (IsTyped)
5666     MIB.addImm(Format);
5667 
5668   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5669       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5670       .addMemOperand(MMO);
5671 }
5672 
5673 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5674                                              MachineRegisterInfo &MRI,
5675                                              MachineIRBuilder &B,
5676                                              bool IsFormat,
5677                                              bool IsTyped) const {
5678   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5679   MachineMemOperand *MMO = *MI.memoperands_begin();
5680   const LLT MemTy = MMO->getMemoryType();
5681   const LLT S32 = LLT::scalar(32);
5682 
5683   Register Dst = MI.getOperand(0).getReg();
5684 
5685   Register StatusDst;
5686   int OpOffset = 0;
5687   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5688   bool IsTFE = MI.getNumExplicitDefs() == 2;
5689   if (IsTFE) {
5690     StatusDst = MI.getOperand(1).getReg();
5691     ++OpOffset;
5692   }
5693 
5694   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5695   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5696 
5697   // The typed intrinsics add an immediate after the registers.
5698   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5699 
5700   // The struct intrinsic variants add one additional operand over raw.
5701   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5702   Register VIndex;
5703   if (HasVIndex) {
5704     VIndex = MI.getOperand(3 + OpOffset).getReg();
5705     ++OpOffset;
5706   } else {
5707     VIndex = B.buildConstant(S32, 0).getReg(0);
5708   }
5709 
5710   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5711   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5712 
5713   unsigned Format = 0;
5714   if (IsTyped) {
5715     Format = MI.getOperand(5 + OpOffset).getImm();
5716     ++OpOffset;
5717   }
5718 
5719   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5720   unsigned ImmOffset;
5721 
5722   LLT Ty = MRI.getType(Dst);
5723   // Make loads of addrspace(8) pointers into 4 x s32 loads here, so the rest
5724   // of the logic doesn't have to handle that case.
5725   if (hasBufferRsrcWorkaround(Ty)) {
5726     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5727     Dst = MI.getOperand(0).getReg();
5728   }
5729   LLT EltTy = Ty.getScalarType();
5730   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5731   const bool Unpacked = ST.hasUnpackedD16VMem();
5732 
5733   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5734 
5735   unsigned Opc;
5736 
5737   // TODO: Support TFE for typed and narrow loads.
5738   if (IsTyped) {
5739     if (IsTFE)
5740       return false;
5741     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5742                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5743   } else if (IsFormat) {
5744     if (IsD16) {
5745       if (IsTFE)
5746         return false;
5747       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5748     } else {
5749       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5750                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5751     }
5752   } else {
5753     if (IsTFE)
5754       return false;
5755     switch (MemTy.getSizeInBits()) {
5756     case 8:
5757       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5758       break;
5759     case 16:
5760       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5761       break;
5762     default:
5763       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5764       break;
5765     }
5766   }
5767 
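       // TFE loads return the data dwords plus one extra status dword in a single
       // wide result. Load into a temporary register, split off the status dword,
       // and repack the remaining dwords into the original destination.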
5768   if (IsTFE) {
5769     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5770     unsigned NumLoadDWords = NumValueDWords + 1;
5771     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5772     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5773     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5774                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5775     if (NumValueDWords == 1) {
5776       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5777     } else {
5778       SmallVector<Register, 5> LoadElts;
5779       for (unsigned I = 0; I != NumValueDWords; ++I)
5780         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5781       LoadElts.push_back(StatusDst);
5782       B.buildUnmerge(LoadElts, LoadDstReg);
5783       LoadElts.truncate(NumValueDWords);
5784       B.buildMergeLikeInstr(Dst, LoadElts);
5785     }
5786   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5787              (IsD16 && !Ty.isVector())) {
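         // Sub-dword and scalar D16 results come back in a full 32-bit register;
         // load into a temporary and truncate back to the requested type.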
5788     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5789     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5790                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5791     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5792     B.buildTrunc(Dst, LoadDstReg);
5793   } else if (Unpacked && IsD16 && Ty.isVector()) {
5794     LLT UnpackedTy = Ty.changeElementSize(32);
5795     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5796     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5797                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5798     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5799     // FIXME: G_TRUNC should work, but legalization currently fails
5800     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5801     SmallVector<Register, 4> Repack;
5802     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5803       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5804     B.buildMergeLikeInstr(Dst, Repack);
5805   } else {
5806     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5807                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5808   }
5809 
5810   MI.eraseFromParent();
5811   return true;
5812 }
5813 
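     /// Map a buffer atomic intrinsic ID to the corresponding
     /// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode.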
5814 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5815   switch (IntrID) {
5816   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5817   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5818   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5819   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5820     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5821   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5822   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5823   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5824   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5825     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5826   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5827   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5828   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5829   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5830     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5831   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5832   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5833   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5834   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5835     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5836   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5837   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5838   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5839   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5840     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5841   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5842   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5843   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5844   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5845     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5846   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5847   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5848   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5849   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5850     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5851   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5852   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5853   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5854   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5855     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5856   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5857   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5858   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5859   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5860     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5861   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5862   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5863   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5864   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5865     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5866   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5867   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5868   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5869   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5870     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5871   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5872   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5873   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5874   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5875     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5876   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5877   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5878   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5879   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5880     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5881   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5882   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5883   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5884   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5885     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5886   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5887   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5888   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5889   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5890     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5891   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5892   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5893   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5894   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5896   default:
5897     llvm_unreachable("unhandled atomic opcode");
5898   }
5899 }
5900 
5901 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5902                                                MachineIRBuilder &B,
5903                                                Intrinsic::ID IID) const {
5904   const bool IsCmpSwap =
5905       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5906       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5907       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5908       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5909 
5910   Register Dst = MI.getOperand(0).getReg();
5911   // Since we don't have 128-bit atomics, we don't need to handle the case of
5912   // p8 arguments to the atomic itself.
5913   Register VData = MI.getOperand(2).getReg();
5914 
5915   Register CmpVal;
5916   int OpOffset = 0;
5917 
5918   if (IsCmpSwap) {
5919     CmpVal = MI.getOperand(3).getReg();
5920     ++OpOffset;
5921   }
5922 
5923   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5924   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5925   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
5926 
5927   // The struct intrinsic variants add one additional operand over raw.
5928   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5929   Register VIndex;
5930   if (HasVIndex) {
5931     VIndex = MI.getOperand(4 + OpOffset).getReg();
5932     ++OpOffset;
5933   } else {
5934     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5935   }
5936 
5937   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5938   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5939   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5940 
5941   MachineMemOperand *MMO = *MI.memoperands_begin();
5942 
5943   unsigned ImmOffset;
5944   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5945 
5946   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
5947       .addDef(Dst)
5948       .addUse(VData); // vdata
5949 
5950   if (IsCmpSwap)
5951     MIB.addReg(CmpVal);
5952 
5953   MIB.addUse(RSrc)               // rsrc
5954      .addUse(VIndex)             // vindex
5955      .addUse(VOffset)            // voffset
5956      .addUse(SOffset)            // soffset
5957      .addImm(ImmOffset)          // offset(imm)
5958      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5959      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5960      .addMemOperand(MMO);
5961 
5962   MI.eraseFromParent();
5963   return true;
5964 }
5965 
5966 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
5967 /// vector with s16 typed elements.
5968 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5969                                       SmallVectorImpl<Register> &PackedAddrs,
5970                                       unsigned ArgOffset,
5971                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5972                                       bool IsA16, bool IsG16) {
5973   const LLT S16 = LLT::scalar(16);
5974   const LLT V2S16 = LLT::fixed_vector(2, 16);
5975   auto EndIdx = Intr->VAddrEnd;
5976 
5977   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5978     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5979     if (!SrcOp.isReg())
5980       continue; // _L to _LZ may have eliminated this.
5981 
5982     Register AddrReg = SrcOp.getReg();
5983 
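         // These operands keep their full dword width: anything before the
         // gradients, gradients when G16 is off, and coordinates when A16 is off.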
5984     if ((I < Intr->GradientStart) ||
5985         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5986         (I >= Intr->CoordStart && !IsA16)) {
5987       if ((I < Intr->GradientStart) && IsA16 &&
5988           (B.getMRI()->getType(AddrReg) == S16)) {
5989         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
5990         // Special handling of bias when A16 is on. Bias is of type half but
5991         // occupies a full 32 bits.
5992         PackedAddrs.push_back(
5993             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5994                 .getReg(0));
5995       } else {
5996         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
5997                "Bias needs to be converted to 16 bit in A16 mode");
5998         // Handle any gradient or coordinate operands that should not be packed
5999         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6000         PackedAddrs.push_back(AddrReg);
6001       }
6002     } else {
6003       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6004       // derivatives dx/dh and dx/dv are packed with undef.
6005       if (((I + 1) >= EndIdx) ||
6006           ((Intr->NumGradients / 2) % 2 == 1 &&
6007            (I == static_cast<unsigned>(Intr->GradientStart +
6008                                        (Intr->NumGradients / 2) - 1) ||
6009             I == static_cast<unsigned>(Intr->GradientStart +
6010                                        Intr->NumGradients - 1))) ||
6011           // Check for _L to _LZ optimization
6012           !MI.getOperand(ArgOffset + I + 1).isReg()) {
6013         PackedAddrs.push_back(
6014             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6015                 .getReg(0));
6016       } else {
6017         PackedAddrs.push_back(
6018             B.buildBuildVector(
6019                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6020                 .getReg(0));
6021         ++I;
6022       }
6023     }
6024   }
6025 }
6026 
6027 /// Convert from separate vaddr components to a single vector address register,
6028 /// and replace the remaining operands with $noreg.
6029 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6030                                      int DimIdx, int NumVAddrs) {
6031   const LLT S32 = LLT::scalar(32);
6032   (void)S32;
6033   SmallVector<Register, 8> AddrRegs;
6034   for (int I = 0; I != NumVAddrs; ++I) {
6035     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6036     if (SrcOp.isReg()) {
6037       AddrRegs.push_back(SrcOp.getReg());
6038       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6039     }
6040   }
6041 
6042   int NumAddrRegs = AddrRegs.size();
6043   if (NumAddrRegs != 1) {
6044     auto VAddr =
6045         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6046     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6047   }
6048 
6049   for (int I = 1; I != NumVAddrs; ++I) {
6050     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6051     if (SrcOp.isReg())
6052       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6053   }
6054 }
6055 
6056 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6057 ///
6058 /// Depending on the subtarget, load/store with 16-bit element data need to be
6059 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6060 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6061 /// registers.
6062 ///
6063 /// We don't want to directly select image instructions just yet, but also want
6064 /// to expose all register repacking to the legalizer/combiners. We also don't
6065 /// want a selected instruction entering RegBankSelect. In order to avoid
6066 /// defining a multitude of intermediate image instructions, directly hack on
6067 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6068 /// padding now-unnecessary arguments with $noreg.
6069 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6070     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6071     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6072 
6073   const MachineFunction &MF = *MI.getMF();
6074   const unsigned NumDefs = MI.getNumExplicitDefs();
6075   const unsigned ArgOffset = NumDefs + 1;
6076   bool IsTFE = NumDefs == 2;
6077   // We are only processing the operands of d16 image operations on subtargets
6078   // that use the unpacked register layout, or need to repack the TFE result.
6079 
6080   // TODO: Do we need to guard against already legalized intrinsics?
6081   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6082       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6083 
6084   MachineRegisterInfo *MRI = B.getMRI();
6085   const LLT S32 = LLT::scalar(32);
6086   const LLT S16 = LLT::scalar(16);
6087   const LLT V2S16 = LLT::fixed_vector(2, 16);
6088 
6089   unsigned DMask = 0;
6090   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6091   LLT Ty = MRI->getType(VData);
6092 
6093   // Check for 16-bit addresses and pack them if so.
6094   LLT GradTy =
6095       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6096   LLT AddrTy =
6097       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6098   const bool IsG16 =
6099       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6100   const bool IsA16 = AddrTy == S16;
6101   const bool IsD16 = Ty.getScalarType() == S16;
6102 
6103   int DMaskLanes = 0;
6104   if (!BaseOpcode->Atomic) {
6105     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6106     if (BaseOpcode->Gather4) {
6107       DMaskLanes = 4;
6108     } else if (DMask != 0) {
6109       DMaskLanes = llvm::popcount(DMask);
6110     } else if (!IsTFE && !BaseOpcode->Store) {
6111       // If dmask is 0, this is a no-op load. This can be eliminated.
6112       B.buildUndef(MI.getOperand(0));
6113       MI.eraseFromParent();
6114       return true;
6115     }
6116   }
6117 
6118   Observer.changingInstr(MI);
6119   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6120 
6121   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6122                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6123   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6124                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6125   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6126 
6127   // Track that we legalized this
6128   MI.setDesc(B.getTII().get(NewOpcode));
6129 
6130   // Expecting to get an error flag since TFE is on and dmask is 0. Force
6131   // dmask to be at least 1, otherwise the instruction will fail.
6132   if (IsTFE && DMask == 0) {
6133     DMask = 0x1;
6134     DMaskLanes = 1;
6135     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6136   }
6137 
6138   if (BaseOpcode->Atomic) {
6139     Register VData0 = MI.getOperand(2).getReg();
6140     LLT Ty = MRI->getType(VData0);
6141 
6142     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6143     if (Ty.isVector())
6144       return false;
6145 
6146     if (BaseOpcode->AtomicX2) {
6147       Register VData1 = MI.getOperand(3).getReg();
6148       // The two values are packed in one register.
6149       LLT PackedTy = LLT::fixed_vector(2, Ty);
6150       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6151       MI.getOperand(2).setReg(Concat.getReg(0));
6152       MI.getOperand(3).setReg(AMDGPU::NoRegister);
6153     }
6154   }
6155 
6156   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6157 
6158   // Rewrite the addressing register layout before doing anything else.
6159   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6160     // 16-bit gradients are supported, but are tied to the A16 control,
6161     // so both gradients and addresses must be 16-bit.
6162     return false;
6163   }
6164 
6165   if (IsA16 && !ST.hasA16()) {
6166     // A16 not supported
6167     return false;
6168   }
6169 
6170   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6171   const bool HasPartialNSA = ST.hasPartialNSAEncoding();
6172 
6173   if (IsA16 || IsG16) {
6174     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6175     // instructions expect VGPR_32
6176     SmallVector<Register, 4> PackedRegs;
6177 
6178     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6179 
6180     // See also below in the non-a16 branch
6181     const bool UseNSA = ST.hasNSAEncoding() &&
6182                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6183                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6184     const bool UsePartialNSA =
6185         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6186 
6187     if (UsePartialNSA) {
6188       // Pack registers that would go over NSAMaxSize into last VAddr register
6189       LLT PackedAddrTy =
6190           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6191       auto Concat = B.buildConcatVectors(
6192           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6193       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6194       PackedRegs.resize(NSAMaxSize);
6195     } else if (!UseNSA && PackedRegs.size() > 1) {
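           // Without NSA every address dword must be contiguous, so collapse all
           // the packed registers into a single vector register.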
6196       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6197       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6198       PackedRegs[0] = Concat.getReg(0);
6199       PackedRegs.resize(1);
6200     }
6201 
6202     const unsigned NumPacked = PackedRegs.size();
6203     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6204       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6205       if (!SrcOp.isReg()) {
6206         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6207         continue;
6208       }
6209 
6210       assert(SrcOp.getReg() != AMDGPU::NoRegister);
6211 
6212       if (I - Intr->VAddrStart < NumPacked)
6213         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6214       else
6215         SrcOp.setReg(AMDGPU::NoRegister);
6216     }
6217   } else {
6218     // If the register allocator cannot place the address registers contiguously
6219     // without introducing moves, then using the non-sequential address encoding
6220     // is always preferable, since it saves VALU instructions and is usually a
6221     // wash in terms of code size or even better.
6222     //
6223     // However, we currently have no way of hinting to the register allocator
6224     // that MIMG addresses should be placed contiguously when it is possible to
6225     // do so, so force non-NSA for the common 2-address case as a heuristic.
6226     //
6227     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6228     // allocation when possible.
6229     //
6230     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6231     // set of the remaining addresses.
6232     const bool UseNSA = ST.hasNSAEncoding() &&
6233                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6234                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6235     const bool UsePartialNSA =
6236         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6237 
6238     if (UsePartialNSA) {
6239       convertImageAddrToPacked(B, MI,
6240                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6241                                Intr->NumVAddrs - NSAMaxSize + 1);
6242     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6243       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6244                                Intr->NumVAddrs);
6245     }
6246   }
6247 
6248   int Flags = 0;
6249   if (IsA16)
6250     Flags |= 1;
6251   if (IsG16)
6252     Flags |= 2;
6253   MI.addOperand(MachineOperand::CreateImm(Flags));
6254 
6255   if (BaseOpcode->Store) { // No TFE for stores?
6256     // TODO: Handle dmask trim
6257     if (!Ty.isVector() || !IsD16)
6258       return true;
6259 
6260     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6261     if (RepackedReg != VData) {
6262       MI.getOperand(1).setReg(RepackedReg);
6263     }
6264 
6265     return true;
6266   }
6267 
6268   Register DstReg = MI.getOperand(0).getReg();
6269   const LLT EltTy = Ty.getScalarType();
6270   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6271 
6272   // Confirm that the return type is large enough for the dmask specified
6273   if (NumElts < DMaskLanes)
6274     return false;
6275 
6276   if (NumElts > 4 || DMaskLanes > 4)
6277     return false;
6278 
6279   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6280   const LLT AdjustedTy =
6281       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6282 
6283   // The raw dword aligned data component of the load. The only legal cases
6284   // where this matters should be when using the packed D16 format, for
6285   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6286   LLT RoundedTy;
6287 
6288   // S32 vector to cover all data, plus TFE result element.
6289   LLT TFETy;
6290 
6291   // Register type to use for each loaded component. Will be S32 or V2S16.
6292   LLT RegTy;
6293 
6294   if (IsD16 && ST.hasUnpackedD16VMem()) {
6295     RoundedTy =
6296         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6297     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6298     RegTy = S32;
6299   } else {
6300     unsigned EltSize = EltTy.getSizeInBits();
6301     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6302     unsigned RoundedSize = 32 * RoundedElts;
6303     RoundedTy = LLT::scalarOrVector(
6304         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6305     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6306     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6307   }
6308 
6309   // The return type does not need adjustment.
6310   // TODO: Should we change s16 case to s32 or <2 x s16>?
6311   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6312     return true;
6313 
6314   Register Dst1Reg;
6315 
6316   // Insert after the instruction.
6317   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6318 
6319   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6320   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6321   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6322   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6323 
6324   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6325 
6326   MI.getOperand(0).setReg(NewResultReg);
6327 
6328   // In the IR, TFE is supposed to be used with a 2 element struct return
6329   // type. The instruction really returns these two values in one contiguous
6330   // register, with one additional dword beyond the loaded data. Rewrite the
6331   // return type to use a single register result.
6332 
6333   if (IsTFE) {
6334     Dst1Reg = MI.getOperand(1).getReg();
6335     if (MRI->getType(Dst1Reg) != S32)
6336       return false;
6337 
6338     // TODO: Make sure the TFE operand bit is set.
6339     MI.removeOperand(1);
6340 
6341     // Handle the easy case that requires no repack instructions.
6342     if (Ty == S32) {
6343       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6344       return true;
6345     }
6346   }
6347 
6348   // Now figure out how to copy the new result register back into the old
6349   // result.
6350   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6351 
6352   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6353 
6354   if (ResultNumRegs == 1) {
6355     assert(!IsTFE);
6356     ResultRegs[0] = NewResultReg;
6357   } else {
6358     // We have to repack into a new vector of some kind.
6359     for (int I = 0; I != NumDataRegs; ++I)
6360       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6361     B.buildUnmerge(ResultRegs, NewResultReg);
6362 
6363     // Drop the final TFE element to get the data part. The TFE result is
6364     // directly written to the right place already.
6365     if (IsTFE)
6366       ResultRegs.resize(NumDataRegs);
6367   }
6368 
6369   // For an s16 scalar result, we form an s32 result with a truncate regardless
6370   // of packed vs. unpacked.
6371   if (IsD16 && !Ty.isVector()) {
6372     B.buildTrunc(DstReg, ResultRegs[0]);
6373     return true;
6374   }
6375 
6376   // Avoid a build/concat_vector of 1 entry.
6377   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6378     B.buildBitcast(DstReg, ResultRegs[0]);
6379     return true;
6380   }
6381 
6382   assert(Ty.isVector());
6383 
6384   if (IsD16) {
6385     // For packed D16 results with TFE enabled, all the data components are
6386     // S32. Cast back to the expected type.
6387     //
6388     // TODO: We don't really need to load s32 elements. We would only need one
6389     // cast for the TFE result if a multiple of v2s16 was used.
6390     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6391       for (Register &Reg : ResultRegs)
6392         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6393     } else if (ST.hasUnpackedD16VMem()) {
6394       for (Register &Reg : ResultRegs)
6395         Reg = B.buildTrunc(S16, Reg).getReg(0);
6396     }
6397   }
6398 
6399   auto padWithUndef = [&](LLT Ty, int NumElts) {
6400     if (NumElts == 0)
6401       return;
6402     Register Undef = B.buildUndef(Ty).getReg(0);
6403     for (int I = 0; I != NumElts; ++I)
6404       ResultRegs.push_back(Undef);
6405   };
6406 
6407   // Pad out any elements eliminated due to the dmask.
6408   LLT ResTy = MRI->getType(ResultRegs[0]);
6409   if (!ResTy.isVector()) {
6410     padWithUndef(ResTy, NumElts - ResultRegs.size());
6411     B.buildBuildVector(DstReg, ResultRegs);
6412     return true;
6413   }
6414 
6415   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6416   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6417 
6418   // Deal with the one annoying legal case.
6419   const LLT V3S16 = LLT::fixed_vector(3, 16);
6420   if (Ty == V3S16) {
6421     if (IsTFE) {
6422       if (ResultRegs.size() == 1) {
6423         NewResultReg = ResultRegs[0];
6424       } else if (ResultRegs.size() == 2) {
6425         LLT V4S16 = LLT::fixed_vector(4, 16);
6426         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6427       } else {
6428         return false;
6429       }
6430     }
6431 
6432     if (MRI->getType(DstReg).getNumElements() <
6433         MRI->getType(NewResultReg).getNumElements()) {
6434       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6435     } else {
6436       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6437     }
6438     return true;
6439   }
6440 
6441   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6442   B.buildConcatVectors(DstReg, ResultRegs);
6443   return true;
6444 }
6445 
6446 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6447   LegalizerHelper &Helper, MachineInstr &MI) const {
6448   MachineIRBuilder &B = Helper.MIRBuilder;
6449   GISelChangeObserver &Observer = Helper.Observer;
6450 
6451   Register Dst = MI.getOperand(0).getReg();
6452   LLT Ty = B.getMRI()->getType(Dst);
6453   unsigned Size = Ty.getSizeInBits();
6454   MachineFunction &MF = B.getMF();
6455 
6456   Observer.changingInstr(MI);
6457 
6458   // Handle needing to s.buffer.load() a p8 value.
6459   if (hasBufferRsrcWorkaround(Ty)) {
6460     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6461     B.setInsertPt(B.getMBB(), MI);
6462   }
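       // Bitcast the destination to a 32-bit element layout when the original
       // type should not be loaded directly, as is done for ordinary loads.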
6463   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6464     Ty = getBitcastRegisterType(Ty);
6465     Helper.bitcastDst(MI, Ty, 0);
6466     B.setInsertPt(B.getMBB(), MI);
6467   }
6468 
6469   // FIXME: We don't really need this intermediate instruction. The intrinsic
6470   // should be fixed to have a memory operand. Since it's readnone, we're not
6471   // allowed to add one.
6472   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
6473   MI.removeOperand(1); // Remove intrinsic ID
6474 
6475   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6476   // TODO: Should this use datalayout alignment?
6477   const unsigned MemSize = (Size + 7) / 8;
6478   const Align MemAlign(4);
6479   MachineMemOperand *MMO = MF.getMachineMemOperand(
6480       MachinePointerInfo(),
6481       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6482           MachineMemOperand::MOInvariant,
6483       MemSize, MemAlign);
6484   MI.addMemOperand(MF, MMO);
6485 
6486   // If we don't have 96-bit result scalar loads, widening to 128-bit should
6487   // always be legal. We may need to restore this to a 96-bit result if it turns
6488   // out this needs to be converted to a vector load during RegBankSelect.
6489   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6490     if (Ty.isVector())
6491       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6492     else
6493       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6494   }
6495 
6496   Observer.changedInstr(MI);
6497   return true;
6498 }
6499 
6500 // TODO: Move to selection
6501 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
6502                                                 MachineRegisterInfo &MRI,
6503                                                 MachineIRBuilder &B) const {
6504   if (!ST.isTrapHandlerEnabled() ||
6505       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6506     return legalizeTrapEndpgm(MI, MRI, B);
6507 
6508   return ST.supportsGetDoorbellID() ?
6509          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6510 }
6511 
6512 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6513     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6514   const DebugLoc &DL = MI.getDebugLoc();
6515   MachineBasicBlock &BB = B.getMBB();
6516   MachineFunction *MF = BB.getParent();
6517 
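       // If the trap is already at the end of a block with no successors, it can
       // be lowered to s_endpgm in place.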
6518   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6519     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6520       .addImm(0);
6521     MI.eraseFromParent();
6522     return true;
6523   }
6524 
6525   // We need a block split to make the real endpgm a terminator. We also don't
6526   // want to break phis in successor blocks, so we can't just delete to the
6527   // end of the block.
6528   BB.splitAt(MI, false /*UpdateLiveIns*/);
6529   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6530   MF->push_back(TrapBB);
6531   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6532     .addImm(0);
6533   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6534     .addMBB(TrapBB);
6535 
6536   BB.addSuccessor(TrapBB);
6537   MI.eraseFromParent();
6538   return true;
6539 }
6540 
6541 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6542     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6543   MachineFunction &MF = B.getMF();
6544   const LLT S64 = LLT::scalar(64);
6545 
6546   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6547   // For code object version 5, queue_ptr is passed through implicit kernarg.
6548   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
6549       AMDGPU::AMDHSA_COV5) {
6550     AMDGPUTargetLowering::ImplicitParameter Param =
6551         AMDGPUTargetLowering::QUEUE_PTR;
6552     uint64_t Offset =
6553         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6554 
6555     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6556         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6557 
6558     if (!loadInputValue(KernargPtrReg, B,
6559                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6560       return false;
6561 
6562     // TODO: can we be smarter about machine pointer info?
6563     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6564     MachineMemOperand *MMO = MF.getMachineMemOperand(
6565         PtrInfo,
6566         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6567             MachineMemOperand::MOInvariant,
6568         LLT::scalar(64), commonAlignment(Align(64), Offset));
6569 
6570     // Pointer address
6571     Register LoadAddr = MRI.createGenericVirtualRegister(
6572         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6573     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6574                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6575     // Load address
6576     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6577     B.buildCopy(SGPR01, Temp);
6578     B.buildInstr(AMDGPU::S_TRAP)
6579         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6580         .addReg(SGPR01, RegState::Implicit);
6581     MI.eraseFromParent();
6582     return true;
6583   }
6584 
6585   // Pass queue pointer to trap handler as input, and insert trap instruction
6586   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6587   Register LiveIn =
6588     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6589   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6590     return false;
6591 
6592   B.buildCopy(SGPR01, LiveIn);
6593   B.buildInstr(AMDGPU::S_TRAP)
6594       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6595       .addReg(SGPR01, RegState::Implicit);
6596 
6597   MI.eraseFromParent();
6598   return true;
6599 }
6600 
6601 bool AMDGPULegalizerInfo::legalizeTrapHsa(
6602     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6603   B.buildInstr(AMDGPU::S_TRAP)
6604       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6605   MI.eraseFromParent();
6606   return true;
6607 }
6608 
6609 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6610     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6611   // If this is a non-HSA path or the trap handler is disabled, report a
6612   // warning accordingly.
6613   if (!ST.isTrapHandlerEnabled() ||
6614       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6615     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6616                                      "debugtrap handler not supported",
6617                                      MI.getDebugLoc(), DS_Warning);
6618     LLVMContext &Ctx = B.getMF().getFunction().getContext();
6619     Ctx.diagnose(NoTrap);
6620   } else {
6621     // Insert debug-trap instruction
6622     B.buildInstr(AMDGPU::S_TRAP)
6623         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6624   }
6625 
6626   MI.eraseFromParent();
6627   return true;
6628 }
6629 
6630 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6631                                                MachineIRBuilder &B) const {
6632   MachineRegisterInfo &MRI = *B.getMRI();
6633   const LLT S16 = LLT::scalar(16);
6634   const LLT S32 = LLT::scalar(32);
6635   const LLT V2S16 = LLT::fixed_vector(2, 16);
6636   const LLT V3S32 = LLT::fixed_vector(3, 32);
6637 
6638   Register DstReg = MI.getOperand(0).getReg();
6639   Register NodePtr = MI.getOperand(2).getReg();
6640   Register RayExtent = MI.getOperand(3).getReg();
6641   Register RayOrigin = MI.getOperand(4).getReg();
6642   Register RayDir = MI.getOperand(5).getReg();
6643   Register RayInvDir = MI.getOperand(6).getReg();
6644   Register TDescr = MI.getOperand(7).getReg();
6645 
6646   if (!ST.hasGFX10_AEncoding()) {
6647     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6648                                         "intrinsic not supported on subtarget",
6649                                         MI.getDebugLoc());
6650     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6651     return false;
6652   }
6653 
6654   const bool IsGFX11 = AMDGPU::isGFX11(ST);
6655   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6656   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6657   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6658   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6659   const unsigned NumVDataDwords = 4;
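       // vaddr dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
       // ray_dir and ray_inv_dir (3 dwords of packed halves with A16, 6 otherwise).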
6660   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6661   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6662   const bool UseNSA =
6663       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6664 
6665   const unsigned BaseOpcodes[2][2] = {
6666       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6667       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6668        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6669   int Opcode;
6670   if (UseNSA) {
6671     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6672                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6673                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
6674                                                : AMDGPU::MIMGEncGfx10NSA,
6675                                    NumVDataDwords, NumVAddrDwords);
6676   } else {
6677     assert(!IsGFX12Plus);
6678     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6679                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6680                                            : AMDGPU::MIMGEncGfx10Default,
6681                                    NumVDataDwords, NumVAddrDwords);
6682   }
6683   assert(Opcode != -1);
6684 
6685   SmallVector<Register, 12> Ops;
6686   if (UseNSA && IsGFX11Plus) {
6687     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6688       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6689       auto Merged = B.buildMergeLikeInstr(
6690           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6691       Ops.push_back(Merged.getReg(0));
6692     };
6693 
6694     Ops.push_back(NodePtr);
6695     Ops.push_back(RayExtent);
6696     packLanes(RayOrigin);
6697 
6698     if (IsA16) {
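           // With A16, pack each ray_inv_dir component into a dword together with
           // the matching ray_dir component before merging into the vaddr vector.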
6699       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6700       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6701       auto MergedDir = B.buildMergeLikeInstr(
6702           V3S32,
6703           {B.buildBitcast(
6704                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6705                                                    UnmergeRayDir.getReg(0)}))
6706                .getReg(0),
6707            B.buildBitcast(
6708                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6709                                                    UnmergeRayDir.getReg(1)}))
6710                .getReg(0),
6711            B.buildBitcast(
6712                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6713                                                    UnmergeRayDir.getReg(2)}))
6714                .getReg(0)});
6715       Ops.push_back(MergedDir.getReg(0));
6716     } else {
6717       packLanes(RayDir);
6718       packLanes(RayInvDir);
6719     }
6720   } else {
6721     if (Is64) {
6722       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6723       Ops.push_back(Unmerge.getReg(0));
6724       Ops.push_back(Unmerge.getReg(1));
6725     } else {
6726       Ops.push_back(NodePtr);
6727     }
6728     Ops.push_back(RayExtent);
6729 
6730     auto packLanes = [&Ops, &S32, &B](Register Src) {
6731       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6732       Ops.push_back(Unmerge.getReg(0));
6733       Ops.push_back(Unmerge.getReg(1));
6734       Ops.push_back(Unmerge.getReg(2));
6735     };
6736 
6737     packLanes(RayOrigin);
6738     if (IsA16) {
6739       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6740       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6741       Register R1 = MRI.createGenericVirtualRegister(S32);
6742       Register R2 = MRI.createGenericVirtualRegister(S32);
6743       Register R3 = MRI.createGenericVirtualRegister(S32);
6744       B.buildMergeLikeInstr(R1,
6745                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6746       B.buildMergeLikeInstr(
6747           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6748       B.buildMergeLikeInstr(
6749           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6750       Ops.push_back(R1);
6751       Ops.push_back(R2);
6752       Ops.push_back(R3);
6753     } else {
6754       packLanes(RayDir);
6755       packLanes(RayInvDir);
6756     }
6757   }
6758 
6759   if (!UseNSA) {
6760     // Build a single vector containing all the operands so far prepared.
6761     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6762     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6763     Ops.clear();
6764     Ops.push_back(MergedOps);
6765   }
6766 
6767   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6768     .addDef(DstReg)
6769     .addImm(Opcode);
6770 
6771   for (Register R : Ops) {
6772     MIB.addUse(R);
6773   }
6774 
6775   MIB.addUse(TDescr)
6776      .addImm(IsA16 ? 1 : 0)
6777      .cloneMemRefs(MI);
6778 
6779   MI.eraseFromParent();
6780   return true;
6781 }
6782 
6783 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6784                                                MachineIRBuilder &B) const {
6785   unsigned Opc;
6786   int RoundMode = MI.getOperand(2).getImm();
6787 
6788   if (RoundMode == (int)RoundingMode::TowardPositive)
6789     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6790   else if (RoundMode == (int)RoundingMode::TowardNegative)
6791     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6792   else
6793     return false;
6794 
6795   B.buildInstr(Opc)
6796       .addDef(MI.getOperand(0).getReg())
6797       .addUse(MI.getOperand(1).getReg());
6798 
6799   MI.eraseFromParent();
6800 
6801   return true;
6802 }
6803 
6804 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6805                                             MachineIRBuilder &B) const {
6806   const SITargetLowering *TLI = ST.getTargetLowering();
6807   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6808   Register DstReg = MI.getOperand(0).getReg();
6809   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6810   MI.eraseFromParent();
6811   return true;
6812 }
6813 
6814 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
6815                                             MachineInstr &MI) const {
6816   MachineIRBuilder &B = Helper.MIRBuilder;
6817   MachineRegisterInfo &MRI = *B.getMRI();
6818 
6819   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
6820   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
6821   switch (IntrID) {
6822   case Intrinsic::amdgcn_if:
6823   case Intrinsic::amdgcn_else: {
6824     MachineInstr *Br = nullptr;
6825     MachineBasicBlock *UncondBrTarget = nullptr;
6826     bool Negated = false;
6827     if (MachineInstr *BrCond =
6828             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6829       const SIRegisterInfo *TRI
6830         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6831 
6832       Register Def = MI.getOperand(1).getReg();
6833       Register Use = MI.getOperand(3).getReg();
6834 
6835       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6836 
6837       if (Negated)
6838         std::swap(CondBrTarget, UncondBrTarget);
6839 
6840       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6841       if (IntrID == Intrinsic::amdgcn_if) {
6842         B.buildInstr(AMDGPU::SI_IF)
6843           .addDef(Def)
6844           .addUse(Use)
6845           .addMBB(UncondBrTarget);
6846       } else {
6847         B.buildInstr(AMDGPU::SI_ELSE)
6848             .addDef(Def)
6849             .addUse(Use)
6850             .addMBB(UncondBrTarget);
6851       }
6852 
6853       if (Br) {
6854         Br->getOperand(0).setMBB(CondBrTarget);
6855       } else {
6856         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
6857         // since we're swapping branch targets it needs to be reinserted.
6858         // FIXME: IRTranslator should probably not do this
6859         B.buildBr(*CondBrTarget);
6860       }
6861 
6862       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
6863       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
6864       MI.eraseFromParent();
6865       BrCond->eraseFromParent();
6866       return true;
6867     }
6868 
6869     return false;
6870   }
6871   case Intrinsic::amdgcn_loop: {
6872     MachineInstr *Br = nullptr;
6873     MachineBasicBlock *UncondBrTarget = nullptr;
6874     bool Negated = false;
6875     if (MachineInstr *BrCond =
6876             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6877       const SIRegisterInfo *TRI
6878         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6879 
6880       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6881       Register Reg = MI.getOperand(2).getReg();
6882 
6883       if (Negated)
6884         std::swap(CondBrTarget, UncondBrTarget);
6885 
6886       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6887       B.buildInstr(AMDGPU::SI_LOOP)
6888         .addUse(Reg)
6889         .addMBB(UncondBrTarget);
6890 
6891       if (Br)
6892         Br->getOperand(0).setMBB(CondBrTarget);
6893       else
6894         B.buildBr(*CondBrTarget);
6895 
6896       MI.eraseFromParent();
6897       BrCond->eraseFromParent();
6898       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
6899       return true;
6900     }
6901 
6902     return false;
6903   }
6904   case Intrinsic::amdgcn_make_buffer_rsrc:
6905     return legalizePointerAsRsrcIntrin(MI, MRI, B);
6906   case Intrinsic::amdgcn_kernarg_segment_ptr:
6907     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
6908       // This only makes sense to call in a kernel, so just lower to null.
6909       B.buildConstant(MI.getOperand(0).getReg(), 0);
6910       MI.eraseFromParent();
6911       return true;
6912     }
6913 
6914     return legalizePreloadedArgIntrin(
6915       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
6916   case Intrinsic::amdgcn_implicitarg_ptr:
6917     return legalizeImplicitArgPtr(MI, MRI, B);
6918   case Intrinsic::amdgcn_workitem_id_x:
6919     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
6920                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
6921   case Intrinsic::amdgcn_workitem_id_y:
6922     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
6923                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
6924   case Intrinsic::amdgcn_workitem_id_z:
6925     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
6926                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
6927   case Intrinsic::amdgcn_workgroup_id_x:
6928     return legalizePreloadedArgIntrin(MI, MRI, B,
6929                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
6930   case Intrinsic::amdgcn_workgroup_id_y:
6931     return legalizePreloadedArgIntrin(MI, MRI, B,
6932                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
6933   case Intrinsic::amdgcn_workgroup_id_z:
6934     return legalizePreloadedArgIntrin(MI, MRI, B,
6935                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6936   case Intrinsic::amdgcn_lds_kernel_id:
6937     return legalizePreloadedArgIntrin(MI, MRI, B,
6938                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6939   case Intrinsic::amdgcn_dispatch_ptr:
6940     return legalizePreloadedArgIntrin(MI, MRI, B,
6941                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
6942   case Intrinsic::amdgcn_queue_ptr:
6943     return legalizePreloadedArgIntrin(MI, MRI, B,
6944                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
6945   case Intrinsic::amdgcn_implicit_buffer_ptr:
6946     return legalizePreloadedArgIntrin(
6947       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
6948   case Intrinsic::amdgcn_dispatch_id:
6949     return legalizePreloadedArgIntrin(MI, MRI, B,
6950                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
6951   case Intrinsic::r600_read_ngroups_x:
6952     // TODO: Emit error for hsa
6953     return legalizeKernargMemParameter(MI, B,
6954                                        SI::KernelInputOffsets::NGROUPS_X);
6955   case Intrinsic::r600_read_ngroups_y:
6956     return legalizeKernargMemParameter(MI, B,
6957                                        SI::KernelInputOffsets::NGROUPS_Y);
6958   case Intrinsic::r600_read_ngroups_z:
6959     return legalizeKernargMemParameter(MI, B,
6960                                        SI::KernelInputOffsets::NGROUPS_Z);
6961   case Intrinsic::r600_read_local_size_x:
6962     // TODO: Could insert G_ASSERT_ZEXT from s16
6963     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
6964   case Intrinsic::r600_read_local_size_y:
6965     // TODO: Could insert G_ASSERT_ZEXT from s16
6966     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
6967   case Intrinsic::r600_read_local_size_z:
6968     // TODO: Could insert G_ASSERT_ZEXT from s16
6969     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
6970   case Intrinsic::r600_read_global_size_x:
6971     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
6972   case Intrinsic::r600_read_global_size_y:
6973     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
6974   case Intrinsic::r600_read_global_size_z:
6975     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
6976   case Intrinsic::amdgcn_fdiv_fast:
6977     return legalizeFDIVFastIntrin(MI, MRI, B);
6978   case Intrinsic::amdgcn_is_shared:
6979     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
6980   case Intrinsic::amdgcn_is_private:
6981     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
6982   case Intrinsic::amdgcn_wavefrontsize: {
6983     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
6984     MI.eraseFromParent();
6985     return true;
6986   }
6987   case Intrinsic::amdgcn_s_buffer_load:
6988     return legalizeSBufferLoad(Helper, MI);
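       // Buffer store/load intrinsics: the raw/struct and ptr/non-ptr resource
       // variants share one lowering; the trailing boolean flags select between
       // the plain, format, and typed (tbuffer) forms.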
6989   case Intrinsic::amdgcn_raw_buffer_store:
6990   case Intrinsic::amdgcn_raw_ptr_buffer_store:
6991   case Intrinsic::amdgcn_struct_buffer_store:
6992   case Intrinsic::amdgcn_struct_ptr_buffer_store:
6993     return legalizeBufferStore(MI, MRI, B, false, false);
6994   case Intrinsic::amdgcn_raw_buffer_store_format:
6995   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
6996   case Intrinsic::amdgcn_struct_buffer_store_format:
6997   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
6998     return legalizeBufferStore(MI, MRI, B, false, true);
6999   case Intrinsic::amdgcn_raw_tbuffer_store:
7000   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7001   case Intrinsic::amdgcn_struct_tbuffer_store:
7002   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7003     return legalizeBufferStore(MI, MRI, B, true, true);
7004   case Intrinsic::amdgcn_raw_buffer_load:
7005   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7006   case Intrinsic::amdgcn_struct_buffer_load:
7007   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7008     return legalizeBufferLoad(MI, MRI, B, false, false);
7009   case Intrinsic::amdgcn_raw_buffer_load_format:
7010   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7011   case Intrinsic::amdgcn_struct_buffer_load_format:
7012   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7013     return legalizeBufferLoad(MI, MRI, B, true, false);
7014   case Intrinsic::amdgcn_raw_tbuffer_load:
7015   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7016   case Intrinsic::amdgcn_struct_tbuffer_load:
7017   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7018     return legalizeBufferLoad(MI, MRI, B, true, true);
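       // All buffer atomic flavors (raw/struct, ptr/non-ptr, integer ops,
       // cmpswap, and FP add/min/max) funnel into a single helper that keys the
       // operation off the intrinsic ID.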
7019   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7020   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7021   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7022   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7023   case Intrinsic::amdgcn_raw_buffer_atomic_add:
7024   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7025   case Intrinsic::amdgcn_struct_buffer_atomic_add:
7026   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7027   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7028   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7029   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7030   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7031   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7032   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7033   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7034   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7035   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7036   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7037   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7038   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7039   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7040   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7041   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7042   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7043   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7044   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7045   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7046   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7047   case Intrinsic::amdgcn_raw_buffer_atomic_and:
7048   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7049   case Intrinsic::amdgcn_struct_buffer_atomic_and:
7050   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7051   case Intrinsic::amdgcn_raw_buffer_atomic_or:
7052   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7053   case Intrinsic::amdgcn_struct_buffer_atomic_or:
7054   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7055   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7056   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7057   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7058   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7059   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7060   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7061   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7062   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7063   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7064   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7065   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7066   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7067   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7068   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7069   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7070   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7071   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7072   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7073   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7074   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7075   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7076   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7077   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7078   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7079   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7080   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7081   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7082   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7083     return legalizeBufferAtomic(MI, B, IntrID);
7084   case Intrinsic::trap:
7085     return legalizeTrapIntrinsic(MI, MRI, B);
7086   case Intrinsic::debugtrap:
7087     return legalizeDebugTrapIntrinsic(MI, MRI, B);
7088   case Intrinsic::amdgcn_rsq_clamp:
7089     return legalizeRsqClampIntrinsic(MI, MRI, B);
7090   case Intrinsic::amdgcn_ds_fadd:
7091   case Intrinsic::amdgcn_ds_fmin:
7092   case Intrinsic::amdgcn_ds_fmax:
7093     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7094   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7095     return legalizeBVHIntrinsic(MI, B);
7096   case Intrinsic::amdgcn_fmed3: {
7097     GISelChangeObserver &Observer = Helper.Observer;
7098 
7099     // FIXME: This is a workaround for the inability of TableGen match
7100     // combiners to match intrinsics in patterns.
7101     Observer.changingInstr(MI);
7102     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7103     MI.removeOperand(1);
7104     Observer.changedInstr(MI);
7105     return true;
7106   }
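       // Image dimension intrinsics are handled generically via their
       // table-driven descriptors; anything else reaching here is already legal.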
7107   default: {
7108     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7109             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7110       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7111     return true;
7112   }
7113   }
7114 
7115   return true;
7116 }
7117